cdminix committed on
Commit
e03a54f
1 Parent(s): 026ee6b

change structure

Browse files
app.py CHANGED
@@ -1,35 +1,95 @@
 
 
 
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
 
 
 
 
 
6
 
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- fields,
23
- )
24
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
25
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
26
- from src.submission.submit import add_new_eval
27
 
28
 
29
  def restart_space():
30
  API.restart_space(repo_id=REPO_ID)
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  ### Space initialisation
34
  try:
35
  print(EVAL_REQUESTS_PATH)
@@ -57,134 +117,154 @@ except Exception:
57
  restart_space()
58
 
59
 
60
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- (
63
- finished_eval_queue_df,
64
- running_eval_queue_df,
65
- pending_eval_queue_df,
66
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  def init_leaderboard(dataframe):
70
  if dataframe is None or dataframe.empty:
71
  raise ValueError("Leaderboard DataFrame is empty or None.")
 
 
 
 
 
 
 
 
72
  return Leaderboard(
73
  value=dataframe,
74
- datatype=[c.type for c in fields(AutoEvalColumn)],
75
  select_columns=SelectColumns(
76
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
77
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
78
  label="Select Columns to Display:",
79
  ),
80
- search_columns=[AutoEvalColumn.model.name],
81
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
82
- filter_columns=[
83
- ColumnFilter(
84
- AutoEvalColumn.params.name,
85
- type="slider",
86
- min=0.01,
87
- max=150,
88
- label="Select the number of parameters (B)",
89
- ),
90
- ],
91
- bool_checkboxgroup_label="Hide models",
92
  interactive=False,
 
93
  )
94
 
95
 
96
- def show_leaderboard(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None):
97
- if profile or True:
98
- gr.HTML(TITLE)
99
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
100
 
101
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
102
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
103
- leaderboard = init_leaderboard(LEADERBOARD_DF)
104
-
105
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
106
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
107
-
108
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
109
- with gr.Column():
110
- with gr.Row():
111
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
112
-
113
- with gr.Column():
114
- with gr.Accordion(
115
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
116
- open=False,
117
- ):
118
- with gr.Row():
119
- finished_eval_table = gr.components.Dataframe(
120
- value=finished_eval_queue_df,
121
- headers=EVAL_COLS,
122
- datatype=EVAL_TYPES,
123
- row_count=5,
124
- )
125
- with gr.Accordion(
126
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
127
- open=False,
128
- ):
129
- with gr.Row():
130
- running_eval_table = gr.components.Dataframe(
131
- value=running_eval_queue_df,
132
- headers=EVAL_COLS,
133
- datatype=EVAL_TYPES,
134
- row_count=5,
135
- )
136
-
137
- with gr.Accordion(
138
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
139
- open=False,
140
- ):
141
- with gr.Row():
142
- pending_eval_table = gr.components.Dataframe(
143
- value=pending_eval_queue_df,
144
- headers=EVAL_COLS,
145
- datatype=EVAL_TYPES,
146
- row_count=5,
147
- )
148
  with gr.Row():
149
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
150
-
 
151
  with gr.Row():
152
  with gr.Column():
153
  model_name_textbox = gr.Textbox(label="Model name")
154
-
155
- submit_button = gr.Button("Submit Eval")
156
- submission_result = gr.Markdown()
157
- submit_button.click(
158
- add_new_eval,
159
- [
160
- model_name_textbox,
161
- ],
162
- submission_result,
163
- )
164
-
165
- with gr.Row():
166
- with gr.Accordion("📙 Citation", open=False):
167
- citation_button = gr.Textbox(
168
- value=CITATION_BUTTON_TEXT,
169
- label=CITATION_BUTTON_LABEL,
170
- lines=20,
171
- elem_id="citation-button",
172
- show_copy_button=True,
173
- )
174
-
175
-
176
- demo = gr.Blocks(css=custom_css)
177
-
178
- with demo:
179
- # gr.LoginButton()
180
- m1 = gr.Markdown("Please login to see the leaderboard.")
181
- # demo.load(show_leaderboard, inputs=None, outputs=m1)
182
- show_leaderboard(None, None)
183
-
184
 
185
  scheduler = BackgroundScheduler()
186
  scheduler.add_job(restart_space, "interval", seconds=1800)
187
  scheduler.start()
188
 
189
- demo.queue(default_concurrency_limit=40).launch()
190
- # demo.launch()
 
1
+ from pathlib import Path
2
+ import json
3
+
4
  import gradio as gr
5
+ from huggingface_hub import snapshot_download
6
+ from gradio_leaderboard import Leaderboard, SelectColumns
7
  import pandas as pd
8
  from apscheduler.schedulers.background import BackgroundScheduler
9
+ from ttsds.benchmarks.benchmark import BenchmarkCategory
10
+ from ttsds import BenchmarkSuite
11
+
12
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, TAGS
13
+ from src.texts import LLM_BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT
14
+ from src.css_html_js import custom_css
15
 
16
+
17
def _has_any_tag(cell, wanted):
    """Return True when a "Tags" cell matches at least one of the *wanted* tags.

    List-like cells are matched by membership and string cells by substring
    (the same semantics ``tag in cell`` had before). Any other value, such as
    None or NaN, never matches instead of raising ``TypeError``.
    """
    if isinstance(cell, (str, list, tuple, set, frozenset)):
        return any(tag in cell for tag in wanted)
    return False


def filter_dfs(tags, lb):
    """Return the full leaderboard table restricted to the selected tags.

    Args:
        tags: Tag names selected in the dropdown. An empty selection (or None)
            disables filtering.
        lb: Table currently shown by the leaderboard. Only its column names
            are inspected: an "Environment" column marks the aggregated view.

    Returns:
        A copy of the module-level ``f_a_df`` (aggregated view) or ``f_b_df``
        (any other view) that keeps only the rows whose "Tags" cell matches at
        least one selected tag.
    """
    # Always start again from the unfiltered module-level table, so removing a
    # tag brings previously filtered-out rows back.
    source = f_a_df if "Environment" in lb.columns else f_b_df
    result = source.copy()
    if tags and not result.empty:
        result = result[result["Tags"].apply(lambda cell: _has_any_tag(cell, tags))]
    return result
 
 
 
 
 
 
 
29
 
30
 
31
  def restart_space():
32
  API.restart_space(repo_id=REPO_ID)
33
 
34
 
35
def _is_http_url(url):
    """Return True when *url* carries an explicit http:// or https:// scheme."""
    return url.startswith(("http://", "https://"))


def _build_display_name(model_name, web_url, hf_url, code_url, paper_url):
    """Build the markdown label stored as the request's "display_name".

    The model name links to *web_url* when one is given; every other URL that
    is present is appended as an emoji link.
    """
    display_name = model_name
    if web_url:
        display_name = f"[{display_name}]({web_url}) "
    for icon, url in (("🤗", hf_url), ("💻", code_url), ("📄", paper_url)):
        if url:
            display_name += f"[{icon}]({url})"
    return display_name


def submit_eval(model_name, model_tags, web_url, hf_url, code_url, paper_url, inference_details, file_path):
    """Validate a submission, store it locally and upload it to the queue repo.

    Args:
        model_name: Human-readable model name. Lower-cased with spaces replaced
            by underscores, it becomes the file stem of the request.
        model_tags: Tags chosen for the model; stored unchanged in the request.
        web_url: Optional website URL, used as the link target of the name.
        hf_url: Optional Hugging Face URL.
        code_url: Optional code URL.
        paper_url: Optional paper URL.
        inference_details: Free-text notes; stored unchanged in the request.
        file_path: Path of the uploaded ``.tar.gz`` archive, or None when
            nothing was uploaded.

    Returns:
        A user-facing status message. Validation problems are reported through
        the message rather than by raising.
    """
    # Local import so this block does not depend on the file's import section.
    import shutil

    # Pure input validation comes first, before any filesystem access.
    if not model_name or not model_name.strip():
        return "Please enter a valid model name"
    model_id = model_name.lower().replace(" ", "_")
    # model_id is used as a file name below, so it must not contain a path
    # separator that would let it escape EVAL_REQUESTS_PATH.
    if "/" in model_id or "\\" in model_id:
        return "Please enter a valid model name"
    if any(url and not _is_http_url(url) for url in (web_url, hf_url, code_url, paper_url)):
        return "Please enter a valid URL"
    # file_path is None when the form is submitted without an upload.
    if not file_path or not str(file_path).endswith(".tar.gz"):
        return "Please upload a .tar.gz file"

    requests_dir = Path(EVAL_REQUESTS_PATH)
    request_file = requests_dir / f"{model_id}.json"
    archive_file = requests_dir / f"{model_id}.tar.gz"
    if request_file.exists():
        return "Model already exists in the evaluation queue"

    # shutil.move also works when the uploaded file lives on a different
    # filesystem than the request directory, where a plain rename fails.
    shutil.move(str(file_path), str(archive_file))

    request_obj = {
        "model_name": model_name,
        "display_name": _build_display_name(model_name, web_url, hf_url, code_url, paper_url),
        "model_tags": model_tags,
        "web_url": web_url,
        "hf_url": hf_url,
        "code_url": code_url,
        "paper_url": paper_url,
        "inference_details": inference_details,
        "status": "pending",
    }
    # The file is closed before the upload so its content is fully written.
    with open(request_file, "w", encoding="utf-8") as f:
        json.dump(request_obj, f)

    for local_file in (request_file, archive_file):
        API.upload_file(
            path_or_fileobj=str(local_file),
            path_in_repo=local_file.name,
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to evaluation queue",
        )
    return "Model submitted successfully 🎉"
91
+
92
+
93
  ### Space initialisation
94
  try:
95
  print(EVAL_REQUESTS_PATH)
 
117
  restart_space()
118
 
119
 
120
def _build_leaderboard_frame(scores):
    """Turn a model-by-score pivot table into a leaderboard-ready frame.

    Adds a leading "Mean" column holding the row-wise mean of all score
    columns, rounds every value to two decimals, appends an empty "Tags"
    column, moves the row index into a "Model" column and sorts the rows by
    "Mean" in descending order.
    """
    frame = scores.copy()
    frame.index.name = "Model"
    # The mean is computed before insertion, so it only covers score columns.
    frame.insert(0, "Mean", frame.mean(axis=1))
    frame = frame.round(2)
    frame["Tags"] = ""
    frame = frame.reset_index()
    return frame.sort_values("Mean", ascending=False)


def _attach_model_details(frame, details):
    """Fill "Tags" and replace "Model" using the submitted request files.

    *details* maps a request file stem to its parsed JSON content. Rows with
    no matching request keep their identifier as the model name and get an
    empty tag value.
    """
    # Strip the archive suffix so the value can be looked up in *details*.
    model_ids = frame["Model"].apply(lambda name: name.replace(".tar.gz", ""))
    # "or ''" also turns a null/empty tag entry into the empty default.
    frame["Tags"] = model_ids.apply(lambda mid: details.get(mid, {}).get("model_tags") or "")
    frame["Model"] = model_ids.apply(lambda mid: details.get(mid, {}).get("display_name", mid))


results_df = pd.read_csv(Path(EVAL_RESULTS_PATH) / "results.csv")

# Aggregated view: one column per benchmark category.
agg_scores = BenchmarkSuite.aggregate_df(results_df).pivot(
    index="dataset", columns="benchmark_category", values="score"
)
agg_scores = agg_scores.rename(columns={"OVERALL": "General"})
agg_scores.columns = [name.capitalize() for name in agg_scores.columns]
agg_df = _build_leaderboard_frame(agg_scores)

# Per-benchmark view: one column per benchmark, ordered by category.
benchmark_order = list(results_df.sort_values("benchmark_category")["benchmark_name"].unique())
benchmark_df = _build_leaderboard_frame(
    results_df.pivot(index="dataset", columns="benchmark_name", values="score")[benchmark_order]
)

# Details for each model, keyed by the stem of its request file.
model_detail_files = Path(EVAL_REQUESTS_PATH).glob("*.json")
model_details = {}
for model_detail_file in model_detail_files:
    with open(model_detail_file, encoding="utf-8") as f:
        model_details[model_detail_file.stem] = json.load(f)

_attach_model_details(benchmark_df, model_details)
_attach_model_details(agg_df, model_details)

# Unfiltered copies that the tag filter starts from on every change.
f_b_df = benchmark_df.copy()
f_a_df = agg_df.copy()
175
 
176
 
def init_leaderboard(dataframe):
    """Build the read-only Leaderboard component for *dataframe*.

    Args:
        dataframe: Table to display. Its "Model" and "Tags" columns are
            declared as markdown, every other column as a number.

    Returns:
        A ``Leaderboard`` with all columns selected by default, "Model" and
        "Mean" not deselectable, search over "Model" and "Tags", no filter
        columns, and "Tags" passed as a hidden column.

    Raises:
        ValueError: If *dataframe* is None or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    markdown_columns = {"Model", "Tags"}
    df_types = ["markdown" if col in markdown_columns else "number" for col in dataframe.columns]
    return Leaderboard(
        value=dataframe,
        select_columns=SelectColumns(
            default_selection=list(dataframe.columns),
            cant_deselect=["Model", "Mean"],
            label="Select Columns to Display:",
        ),
        search_columns=["Model", "Tags"],
        filter_columns=[],
        hide_columns=["Tags"],
        interactive=False,
        datatype=df_types,
    )
201
 
202
 
203
+ app = gr.Blocks(css=custom_css, title="TTS Benchmark Leaderboard")
 
 
 
204
 
205
+ with app:
206
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
207
+ with gr.TabItem("🏅 TTSDB Scores", elem_id="llm-benchmark-tab-table", id=0):
208
+ tags = gr.Dropdown(
209
+ TAGS,
210
+ value=[],
211
+ multiselect=True,
212
+ label="Tags",
213
+ info="Select tags to filter the leaderboard. You can suggest new tags here: https://huggingface.co/spaces/ttsds/benchmark/discussions/1",
214
+ )
215
+ leaderboard = init_leaderboard(f_a_df)
216
+ tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
217
+ with gr.TabItem("🏅 Individual Benchmarks", elem_id="llm-benchmark-tab-table", id=1):
218
+ tags = gr.Dropdown(
219
+ TAGS,
220
+ value=[],
221
+ multiselect=True,
222
+ label="Tags",
223
+ info="Select tags to filter the leaderboard",
224
+ )
225
+ leaderboard = init_leaderboard(f_b_df)
226
+ tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
227
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
228
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
229
+ with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
230
+ with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  with gr.Row():
232
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
233
+ with gr.Row():
234
+ gr.Markdown("# ✉️✨ Submit a TTS dataset here!", elem_classes="markdown-text")
235
  with gr.Row():
236
  with gr.Column():
237
  model_name_textbox = gr.Textbox(label="Model name")
238
+ model_tags_dropdown = gr.Dropdown(
239
+ label="Model tags",
240
+ choices=TAGS,
241
+ multiselect=True,
242
+ )
243
+ website_url_textbox = gr.Textbox(label="Website URL (optional)")
244
+ hf_url_textbox = gr.Textbox(label="Huggingface URL (optional)")
245
+ code_url_textbox = gr.Textbox(label="Code URL (optional)")
246
+ paper_url_textbox = gr.Textbox(label="Paper URL (optional)")
247
+ inference_details_textbox = gr.TextArea(label="Inference details (optional)")
248
+ file_input = gr.File(file_types=[".gz"], interactive=True, label=".tar.gz TTS dataset")
249
+ submit_button = gr.Button("Submit Eval")
250
+ submission_result = gr.Markdown()
251
+ submit_button.click(
252
+ submit_eval,
253
+ [
254
+ model_name_textbox,
255
+ model_tags_dropdown,
256
+ website_url_textbox,
257
+ hf_url_textbox,
258
+ code_url_textbox,
259
+ paper_url_textbox,
260
+ inference_details_textbox,
261
+ file_input,
262
+ ],
263
+ submission_result,
264
+ )
 
 
 
265
 
266
  scheduler = BackgroundScheduler()
267
  scheduler.add_job(restart_space, "interval", seconds=1800)
268
  scheduler.start()
269
 
270
+ app.queue(default_concurrency_limit=40).launch()
 
new/app.py DELETED
@@ -1,270 +0,0 @@
1
- from pathlib import Path
2
- import json
3
-
4
- import gradio as gr
5
- from huggingface_hub import snapshot_download
6
- from gradio_leaderboard import Leaderboard, SelectColumns
7
- import pandas as pd
8
- from apscheduler.schedulers.background import BackgroundScheduler
9
- from ttsdb.benchmarks.benchmark import BenchmarkCategory
10
- from ttsdb import BenchmarkSuite
11
-
12
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, TAGS
13
- from src.texts import LLM_BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT
14
- from src.css_html_js import custom_css
15
-
16
-
17
- def filter_dfs(tags, lb):
18
- global f_b_df, f_a_df
19
- is_agg = False
20
- if "Environment" in lb.columns:
21
- is_agg = True
22
- if is_agg:
23
- lb = f_a_df.copy()
24
- else:
25
- lb = f_b_df.copy()
26
- if tags and len(lb) > 0:
27
- lb = lb[lb["Tags"].apply(lambda x: any(tag in x for tag in tags))]
28
- return lb
29
-
30
-
31
- def restart_space():
32
- API.restart_space(repo_id=REPO_ID)
33
-
34
-
35
- def submit_eval(model_name, model_tags, web_url, hf_url, code_url, paper_url, inference_details, file_path):
36
- model_id = model_name.lower().replace(" ", "_")
37
- # check if model already exists
38
- if Path(f"{EVAL_REQUESTS_PATH}/{model_id}.json").exists():
39
- return "Model already exists in the evaluation queue"
40
- # check which urls are valid
41
- if web_url and not web_url.startswith("http"):
42
- return "Please enter a valid URL"
43
- if hf_url and not hf_url.startswith("http"):
44
- return "Please enter a valid URL"
45
- if code_url and not code_url.startswith("http"):
46
- return "Please enter a valid URL"
47
- if paper_url and not paper_url.startswith("http"):
48
- return "Please enter a valid URL"
49
- # move file to correct location
50
- if not file_path.endswith(".tar.gz"):
51
- return "Please upload a .tar.gz file"
52
- Path(file_path).rename(f"{EVAL_REQUESTS_PATH}/{model_id}.tar.gz")
53
- # build display name - use web_url to link text if available, and emojis for the other urls
54
- display_name = model_name
55
- if web_url:
56
- display_name = f"[{display_name}]({web_url}) "
57
- if hf_url:
58
- display_name += f"[🤗]({hf_url})"
59
- if code_url:
60
- display_name += f"[💻]({code_url})"
61
- if paper_url:
62
- display_name += f"[📄]({paper_url})"
63
- request_obj = {
64
- "model_name": model_name,
65
- "display_name": display_name,
66
- "model_tags": model_tags,
67
- "web_url": web_url,
68
- "hf_url": hf_url,
69
- "code_url": code_url,
70
- "paper_url": paper_url,
71
- "inference_details": inference_details,
72
- "status": "pending",
73
- }
74
- with open(f"{EVAL_REQUESTS_PATH}/{model_id}.json", "w") as f:
75
- json.dump(request_obj, f)
76
- API.upload_file(
77
- path_or_fileobj=f"{EVAL_REQUESTS_PATH}/{model_id}.json",
78
- path_in_repo=f"{model_id}.json",
79
- repo_id=QUEUE_REPO,
80
- repo_type="dataset",
81
- commit_message=f"Add {model_name} to evaluation queue",
82
- )
83
- API.upload_file(
84
- path_or_fileobj=f"{EVAL_REQUESTS_PATH}/{model_id}.tar.gz",
85
- path_in_repo=f"{model_id}.tar.gz",
86
- repo_id=QUEUE_REPO,
87
- repo_type="dataset",
88
- commit_message=f"Add {model_name} to evaluation queue",
89
- )
90
- return "Model submitted successfully 🎉"
91
-
92
-
93
- ### Space initialisation
94
- try:
95
- print(EVAL_REQUESTS_PATH)
96
- snapshot_download(
97
- repo_id=QUEUE_REPO,
98
- local_dir=EVAL_REQUESTS_PATH,
99
- repo_type="dataset",
100
- tqdm_class=None,
101
- etag_timeout=30,
102
- token=TOKEN,
103
- )
104
- except Exception:
105
- restart_space()
106
- try:
107
- print(EVAL_RESULTS_PATH)
108
- snapshot_download(
109
- repo_id=RESULTS_REPO,
110
- local_dir=EVAL_RESULTS_PATH,
111
- repo_type="dataset",
112
- tqdm_class=None,
113
- etag_timeout=30,
114
- token=TOKEN,
115
- )
116
- except Exception:
117
- restart_space()
118
-
119
-
120
- results_df = pd.read_csv(EVAL_RESULTS_PATH + "/results.csv")
121
-
122
- agg_df = BenchmarkSuite.aggregate_df(results_df)
123
- agg_df = agg_df.pivot(index="dataset", columns="benchmark_category", values="score")
124
- agg_df.rename(columns={"OVERALL": "General"}, inplace=True)
125
- agg_df.columns = [x.capitalize() for x in agg_df.columns]
126
- agg_df["Mean"] = agg_df.mean(axis=1)
127
- # make sure mean is the first column
128
- agg_df = agg_df[["Mean"] + [col for col in agg_df.columns if col != "Mean"]]
129
- for col in agg_df.columns:
130
- agg_df[col] = agg_df[col].apply(lambda x: round(x, 2))
131
- agg_df["Tags"] = ""
132
- agg_df.reset_index(inplace=True)
133
- agg_df.rename(columns={"dataset": "Model"}, inplace=True)
134
- agg_df.sort_values("Mean", ascending=False, inplace=True)
135
-
136
- benchmark_df = results_df.pivot(index="dataset", columns="benchmark_name", values="score")
137
-
138
- # get benchmark name order by category
139
- benchmark_order = list(results_df.sort_values("benchmark_category")["benchmark_name"].unique())
140
- benchmark_df = benchmark_df[benchmark_order]
141
- benchmark_df = benchmark_df.reset_index()
142
- benchmark_df.rename(columns={"dataset": "Model"}, inplace=True)
143
- # set index
144
- benchmark_df.set_index("Model", inplace=True)
145
- benchmark_df["Mean"] = benchmark_df.mean(axis=1)
146
- # make sure mean is the first column
147
- benchmark_df = benchmark_df[["Mean"] + [col for col in benchmark_df.columns if col != "Mean"]]
148
- # round all
149
- for col in benchmark_df.columns:
150
- benchmark_df[col] = benchmark_df[col].apply(lambda x: round(x, 2))
151
- benchmark_df["Tags"] = ""
152
- benchmark_df.reset_index(inplace=True)
153
- benchmark_df.sort_values("Mean", ascending=False, inplace=True)
154
-
155
- # get details for each model
156
- model_detail_files = Path(EVAL_REQUESTS_PATH).glob("*.json")
157
- model_details = {}
158
- for model_detail_file in model_detail_files:
159
- with open(model_detail_file) as f:
160
- model_detail = json.load(f)
161
- model_details[model_detail_file.stem] = model_detail
162
-
163
- # replace .tar.gz
164
- benchmark_df["Model"] = benchmark_df["Model"].apply(lambda x: x.replace(".tar.gz", ""))
165
- agg_df["Model"] = agg_df["Model"].apply(lambda x: x.replace(".tar.gz", ""))
166
-
167
- benchmark_df["Tags"] = benchmark_df["Model"].apply(lambda x: model_details.get(x, {}).get("model_tags", ""))
168
- agg_df["Tags"] = agg_df["Model"].apply(lambda x: model_details.get(x, {}).get("model_tags", ""))
169
-
170
- benchmark_df["Model"] = benchmark_df["Model"].apply(lambda x: model_details.get(x, {}).get("display_name", x))
171
- agg_df["Model"] = agg_df["Model"].apply(lambda x: model_details.get(x, {}).get("display_name", x))
172
-
173
- f_b_df = benchmark_df.copy()
174
- f_a_df = agg_df.copy()
175
-
176
-
177
- def init_leaderboard(dataframe):
178
- if dataframe is None or dataframe.empty:
179
- raise ValueError("Leaderboard DataFrame is empty or None.")
180
- df_types = []
181
- for col in dataframe.columns:
182
- if col == "Model":
183
- df_types.append("markdown")
184
- elif col == "Tags":
185
- df_types.append("markdown")
186
- else:
187
- df_types.append("number")
188
- return Leaderboard(
189
- value=dataframe,
190
- select_columns=SelectColumns(
191
- default_selection=list(dataframe.columns),
192
- cant_deselect=["Model", "Mean"],
193
- label="Select Columns to Display:",
194
- ),
195
- search_columns=["Model", "Tags"],
196
- filter_columns=[],
197
- hide_columns=["Tags"],
198
- interactive=False,
199
- datatype=df_types,
200
- )
201
-
202
-
203
- app = gr.Blocks(css=custom_css, title="TTS Benchmark Leaderboard")
204
-
205
- with app:
206
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
207
- with gr.TabItem("🏅 TTSDB Scores", elem_id="llm-benchmark-tab-table", id=0):
208
- tags = gr.Dropdown(
209
- TAGS,
210
- value=[],
211
- multiselect=True,
212
- label="Tags",
213
- info="Select tags to filter the leaderboard. You can suggest new tags here: https://huggingface.co/spaces/ttsds/benchmark/discussions/1",
214
- )
215
- leaderboard = init_leaderboard(f_a_df)
216
- tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
217
- with gr.TabItem("🏅 Individual Benchmarks", elem_id="llm-benchmark-tab-table", id=1):
218
- tags = gr.Dropdown(
219
- TAGS,
220
- value=[],
221
- multiselect=True,
222
- label="Tags",
223
- info="Select tags to filter the leaderboard",
224
- )
225
- leaderboard = init_leaderboard(f_b_df)
226
- tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
227
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
228
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
229
- with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
230
- with gr.Column():
231
- with gr.Row():
232
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
233
- with gr.Row():
234
- gr.Markdown("# ✉️✨ Submit a TTS dataset here!", elem_classes="markdown-text")
235
- with gr.Row():
236
- with gr.Column():
237
- model_name_textbox = gr.Textbox(label="Model name")
238
- model_tags_dropdown = gr.Dropdown(
239
- label="Model tags",
240
- choices=TAGS,
241
- multiselect=True,
242
- )
243
- website_url_textbox = gr.Textbox(label="Website URL (optional)")
244
- hf_url_textbox = gr.Textbox(label="Huggingface URL (optional)")
245
- code_url_textbox = gr.Textbox(label="Code URL (optional)")
246
- paper_url_textbox = gr.Textbox(label="Paper URL (optional)")
247
- inference_details_textbox = gr.TextArea(label="Inference details (optional)")
248
- file_input = gr.File(file_types=[".gz"], interactive=True, label=".tar.gz TTS dataset")
249
- submit_button = gr.Button("Submit Eval")
250
- submission_result = gr.Markdown()
251
- submit_button.click(
252
- submit_eval,
253
- [
254
- model_name_textbox,
255
- model_tags_dropdown,
256
- website_url_textbox,
257
- hf_url_textbox,
258
- code_url_textbox,
259
- paper_url_textbox,
260
- inference_details_textbox,
261
- file_input,
262
- ],
263
- submission_result,
264
- )
265
-
266
- scheduler = BackgroundScheduler()
267
- scheduler.add_job(restart_space, "interval", seconds=1800)
268
- scheduler.start()
269
-
270
- app.queue(default_concurrency_limit=40).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
new/requirements.txt DELETED
@@ -1,17 +0,0 @@
1
- APScheduler
2
- black
3
- datasets
4
- gradio
5
- gradio[oauth]
6
- gradio_leaderboard==0.0.9
7
- gradio_client
8
- huggingface-hub>=0.18.0
9
- matplotlib
10
- numpy
11
- pandas
12
- python-dateutil
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- sentencepiece
17
- markdown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
new/src/envs.py DELETED
@@ -1,38 +0,0 @@
1
- import os
2
-
3
- from huggingface_hub import HfApi
4
-
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
-
9
- OWNER = "ttsds" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
-
12
- REPO_ID = f"{OWNER}/leaderboard"
13
- QUEUE_REPO = f"{OWNER}/requests"
14
- RESULTS_REPO = f"{OWNER}/results"
15
-
16
- # If you setup a cache later, just change HF_HOME
17
- CACHE_PATH = os.getenv("HF_HOME", ".")
18
-
19
- # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
-
25
- API = HfApi(token=TOKEN)
26
-
27
- TAGS = [
28
- "Normalizing Flow",
29
- "Reference-based (Speaker)",
30
- "Prompt-based (Speaker)",
31
- "Prosodic Correlates",
32
- "Adversarial",
33
- "Diffusion",
34
- "Audio Tokens",
35
- "Autoregressive",
36
- "Non-autoregressive",
37
- "Pretrained Text Encoder",
38
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -13,4 +13,5 @@ python-dateutil
13
  tqdm
14
  transformers
15
  tokenizers>=0.15.0
16
- sentencepiece
 
 
13
  tqdm
14
  transformers
15
  tokenizers>=0.15.0
16
+ sentencepiece
17
+ ttsds
src/about.py DELETED
@@ -1,60 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
-
4
-
5
- @dataclass
6
- class Task:
7
- benchmark: str
8
- metric: str
9
- col_name: str
10
- category: str
11
-
12
-
13
- # Select your tasks here
14
- # ---------------------------------------------------
15
- class Tasks(Enum):
16
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
17
- task0 = Task("anli_r1", "acc", "ANLI", "")
18
- task1 = Task("logiqa", "acc_norm", "LogiQA", "")
19
-
20
-
21
- NUM_FEWSHOT = 0 # Change with your few shot
22
- # ---------------------------------------------------
23
-
24
-
25
- # Your leaderboard name
26
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
27
-
28
- # What does your leaderboard evaluate?
29
- INTRODUCTION_TEXT = """
30
- Intro text
31
- """
32
-
33
- # Which evaluations are you running? how can people reproduce what you have?
34
- LLM_BENCHMARKS_TEXT = f"""
35
- ## How it works
36
-
37
- ## Reproducibility
38
- To reproduce our results, check out our repository [here](https://github.com/ttsds/ttsds).
39
-
40
- """
41
-
42
- EVALUATION_QUEUE_TEXT = """
43
- ## How to submit a TTS model to the leaderboard
44
-
45
- ### 1) download the evaluation dataset
46
- The evaluation dataset consists of wav / text pairs.
47
- You can download it [here](https://huggingface.co/ttsds/eval).
48
-
49
- ### 2) create your TTS dataset
50
- Create a dataset with your TTS model and the evaluation dataset.
51
- Use the wav files as speaker reference and the text as the prompt.
52
- Create a .tar.gz file with the dataset, and make sure to inlcude .wav files and .txt files.
53
-
54
- ### 3) submit your TTS dataset
55
- Submit your dataset below.
56
- """
57
-
58
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
59
- CITATION_BUTTON_TEXT = r"""
60
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{new/src → src}/css_html_js.py RENAMED
File without changes
src/display/css_html_js.py DELETED
@@ -1,105 +0,0 @@
1
- custom_css = """
2
-
3
- .markdown-text {
4
- font-size: 16px !important;
5
- }
6
-
7
- #models-to-add-text {
8
- font-size: 18px !important;
9
- }
10
-
11
- #citation-button span {
12
- font-size: 16px !important;
13
- }
14
-
15
- #citation-button textarea {
16
- font-size: 16px !important;
17
- }
18
-
19
- #citation-button > label > button {
20
- margin: 6px;
21
- transform: scale(1.3);
22
- }
23
-
24
- #leaderboard-table {
25
- margin-top: 15px
26
- }
27
-
28
- #leaderboard-table-lite {
29
- margin-top: 15px
30
- }
31
-
32
- #search-bar-table-box > div:first-child {
33
- background: none;
34
- border: none;
35
- }
36
-
37
- #search-bar {
38
- padding: 0px;
39
- }
40
-
41
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
- table td:first-child,
43
- table th:first-child {
44
- max-width: 400px;
45
- overflow: auto;
46
- white-space: nowrap;
47
- }
48
-
49
- .tab-buttons button {
50
- font-size: 20px;
51
- }
52
-
53
- #scale-logo {
54
- border-style: none !important;
55
- box-shadow: none;
56
- display: block;
57
- margin-left: auto;
58
- margin-right: auto;
59
- max-width: 600px;
60
- }
61
-
62
- #scale-logo .download {
63
- display: none;
64
- }
65
- #filter_type{
66
- border: 0;
67
- padding-left: 0;
68
- padding-top: 0;
69
- }
70
- #filter_type label {
71
- display: flex;
72
- }
73
- #filter_type label > span{
74
- margin-top: var(--spacing-lg);
75
- margin-right: 0.5em;
76
- }
77
- #filter_type label > .wrap{
78
- width: 103px;
79
- }
80
- #filter_type label > .wrap .wrap-inner{
81
- padding: 2px;
82
- }
83
- #filter_type label > .wrap .wrap-inner input{
84
- width: 1px
85
- }
86
- #filter-columns-type{
87
- border:0;
88
- padding:0.5;
89
- }
90
- #filter-columns-size{
91
- border:0;
92
- padding:0.5;
93
- }
94
- #box-filter > .form{
95
- border: 0
96
- }
97
- """
98
-
99
- get_window_url_params = """
100
- function(url_params) {
101
- const params = new URLSearchParams(window.location.search);
102
- url_params = Object.fromEntries(params);
103
- return url_params;
104
- }
105
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/formatting.py DELETED
@@ -1,27 +0,0 @@
1
- def model_hyperlink(link, model_name):
2
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
-
4
-
5
- def make_clickable_model(model_name):
6
- link = f"https://huggingface.co/{model_name}"
7
- return model_hyperlink(link, model_name)
8
-
9
-
10
- def styled_error(error):
11
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
12
-
13
-
14
- def styled_warning(warn):
15
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
16
-
17
-
18
- def styled_message(message):
19
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
20
-
21
-
22
- def has_no_nan_values(df, columns):
23
- return df[columns].notna().all(axis=1)
24
-
25
-
26
- def has_nan_values(df, columns):
27
- return df[columns].isna().any(axis=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/utils.py DELETED
@@ -1,58 +0,0 @@
1
- from dataclasses import dataclass, make_dataclass
2
- from enum import Enum
3
-
4
- import pandas as pd
5
-
6
- from src.about import Tasks
7
-
8
-
9
- def fields(raw_class):
10
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
11
-
12
-
13
- # These classes are for user facing column names,
14
- # to avoid having to change them all around the code
15
- # when a modif is needed
16
- @dataclass
17
- class ColumnContent:
18
- name: str
19
- type: str
20
- displayed_by_default: bool
21
- hidden: bool = False
22
- never_hidden: bool = False
23
-
24
-
25
- @dataclass(frozen=True)
26
- class AutoEvalColumn:
27
- model = ColumnContent("model", "markdown", True, never_hidden=True)
28
- average = ColumnContent("average", "number", True)
29
- general = ColumnContent("general", "number", True)
30
- speaker = ColumnContent("speaker", "number", True)
31
- prosody = ColumnContent("prosody", "number", True)
32
- intelligibility = ColumnContent("intelligibility", "number", True)
33
- environment = ColumnContent("environment", "number", True)
34
- tags = ColumnContent("tags", "str", False)
35
-
36
-
37
- ## For the queue columns in the submission tab
38
- @dataclass(frozen=True)
39
- class EvalQueueColumn: # Queue column
40
- model = ColumnContent("model", "markdown", True)
41
- status = ColumnContent("status", "str", True)
42
-
43
-
44
- ## All the model information that we might need
45
- @dataclass
46
- class ModelDetails:
47
- name: str
48
- display_name: str = ""
49
- symbol: str = "" # emoji
50
-
51
-
52
- # Column selection
53
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
54
-
55
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
56
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
57
-
58
- BENCHMARK_COLS = ["general", "speaker", "prosody", "intelligibility", "environment"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/envs.py CHANGED
@@ -23,3 +23,16 @@ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
 
25
  API = HfApi(token=TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
 
25
  API = HfApi(token=TOKEN)
26
+
27
+ TAGS = [
28
+ "Normalizing Flow",
29
+ "Reference-based (Speaker)",
30
+ "Prompt-based (Speaker)",
31
+ "Prosodic Correlates",
32
+ "Adversarial",
33
+ "Diffusion",
34
+ "Audio Tokens",
35
+ "Autoregressive",
36
+ "Non-autoregressive",
37
+ "Pretrained Text Encoder",
38
+ ]
src/leaderboard/read_evals.py DELETED
@@ -1,129 +0,0 @@
1
- import glob
2
- import json
3
- import math
4
- import os
5
- from dataclasses import dataclass
6
-
7
- import dateutil
8
- import numpy as np
9
-
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, Tasks
12
-
13
-
14
- @dataclass
15
- class EvalResult:
16
- """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
17
-
18
- model_id: str
19
- results: dict
20
- date: str = "" # submission date of request file
21
-
22
- @classmethod
23
- def init_from_json_file(self, json_filepath):
24
- """Inits the result from the specific model result file"""
25
- with open(json_filepath) as fp:
26
- data = json.load(fp)
27
-
28
- config = data.get("config")
29
-
30
- # Extract model info
31
- model = config.get("model_name", "")
32
-
33
- # Extract results available in this file (some results are split in several files)
34
- results = {}
35
- for task in Tasks:
36
- task = task.value
37
-
38
- # We average all scores of a given metric (not all metrics are present in all files)
39
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
40
- if accs.size == 0 or any([acc is None for acc in accs]):
41
- continue
42
-
43
- mean_acc = np.mean(accs) * 100.0
44
- results[task.benchmark] = mean_acc
45
-
46
- return self(
47
- model_id=model,
48
- results=results,
49
- )
50
-
51
- def update_with_request_file(self, requests_path):
52
- """Finds the relevant request file for the current model and updates info with it"""
53
- request_file = get_request_file_for_model(requests_path, self.full_model)
54
-
55
- try:
56
- with open(request_file, "r") as f:
57
- request = json.load(f)
58
- self.model_id = request.get("model", self.model_id)
59
- self.results
60
- except Exception:
61
- print(
62
- f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
63
- )
64
-
65
- def to_dict(self):
66
- """Converts the Eval Result to a dict compatible with our dataframe display"""
67
- data_dict = {
68
-
69
-
70
-
71
- def get_request_file_for_model(requests_path, model_name):
72
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
73
- request_files = os.path.join(
74
- requests_path,
75
- f"{model_name}_eval_request_*.json",
76
- )
77
- request_files = glob.glob(request_files)
78
-
79
- # Select correct request file
80
- request_file = ""
81
- request_files = sorted(request_files, reverse=True)
82
- for tmp_request_file in request_files:
83
- with open(tmp_request_file, "r") as f:
84
- req_content = json.load(f)
85
- if req_content["status"] in ["FINISHED"]:
86
- request_file = tmp_request_file
87
- return request_file
88
-
89
-
90
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
91
- """From the path of the results folder root, extract all needed info for results"""
92
- model_result_filepaths = []
93
-
94
- for root, _, files in os.walk(results_path):
95
- # We should only have json files in model results
96
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
97
- continue
98
-
99
- # Sort the files by date
100
- try:
101
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
102
- except dateutil.parser._parser.ParserError:
103
- files = [files[-1]]
104
-
105
- for file in files:
106
- model_result_filepaths.append(os.path.join(root, file))
107
-
108
- eval_results = {}
109
- for model_result_filepath in model_result_filepaths:
110
- # Creation of result
111
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
112
- eval_result.update_with_request_file(requests_path)
113
-
114
- # Store results of same eval together
115
- eval_name = eval_result.eval_name
116
- if eval_name in eval_results.keys():
117
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
118
- else:
119
- eval_results[eval_name] = eval_result
120
-
121
- results = []
122
- for v in eval_results.values():
123
- try:
124
- v.to_dict() # we test if the dict version is complete
125
- results.append(v)
126
- except KeyError: # not all eval values present
127
- continue
128
-
129
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/populate.py DELETED
@@ -1,58 +0,0 @@
1
- import json
2
- import os
3
-
4
- import pandas as pd
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
-
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
-
16
- df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
- df = df[cols].round(decimals=2)
19
-
20
- # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
- return df
23
-
24
-
25
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
- """Creates the different dataframes for the evaluation queues requestes"""
27
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
- all_evals = []
29
-
30
- for entry in entries:
31
- if ".json" in entry:
32
- file_path = os.path.join(save_path, entry)
33
- with open(file_path) as fp:
34
- data = json.load(fp)
35
-
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
-
39
- all_evals.append(data)
40
- elif ".md" not in entry:
41
- # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
43
- for sub_entry in sub_entries:
44
- file_path = os.path.join(save_path, entry, sub_entry)
45
- with open(file_path) as fp:
46
- data = json.load(fp)
47
-
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
- all_evals.append(data)
51
-
52
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
54
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
55
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
- return df_finished[cols], df_running[cols], df_pending[cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/submission/check_validity.py DELETED
@@ -1,36 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
-
13
-
14
- def already_submitted_models(requested_models_dir: str) -> set[str]:
15
- """Gather a list of already submitted models to avoid duplicates"""
16
- depth = 1
17
- file_names = []
18
- users_to_submission_dates = defaultdict(list)
19
-
20
- for root, _, files in os.walk(requested_models_dir):
21
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
22
- if current_depth == depth:
23
- for file in files:
24
- if not file.endswith(".json"):
25
- continue
26
- with open(os.path.join(root, file), "r") as f:
27
- info = json.load(f)
28
- file_names.append(f"{info['model']}_{info['revision']}")
29
-
30
- # Select organisation
31
- if info["model"].count("/") == 0 or "submitted_time" not in info:
32
- continue
33
- organisation, _ = info["model"].split("/")
34
- users_to_submission_dates[organisation].append(info["submitted_time"])
35
-
36
- return set(file_names), users_to_submission_dates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/submission/submit.py DELETED
@@ -1,69 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
- from typing import List
5
-
6
- from src.display.formatting import styled_error, styled_message, styled_warning
7
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
8
- from src.submission.check_validity import already_submitted_models
9
-
10
-
11
- REQUESTED_MODELS = None
12
- USERS_TO_SUBMISSION_DATES = None
13
-
14
-
15
- def add_new_eval(
16
- model: str,
17
- tags: List[str],
18
- ):
19
- global REQUESTED_MODELS
20
- global USERS_TO_SUBMISSION_DATES
21
- if not REQUESTED_MODELS:
22
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
23
-
24
- user_name = ""
25
- model_name = model
26
-
27
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
28
-
29
- # Does the model actually exist?
30
- if revision == "":
31
- revision = "main"
32
-
33
- # Seems good, creating the eval
34
- print("Adding new eval")
35
-
36
- eval_entry = {
37
- "model": model,
38
- "status": "PENDING",
39
- "submitted_time": current_time,
40
- "private": False,
41
- }
42
-
43
- # Check for duplicate submission
44
- if f"{model}_{revision}" in REQUESTED_MODELS:
45
- return styled_warning("This model has been already submitted.")
46
-
47
- print("Creating eval file")
48
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
49
- os.makedirs(OUT_DIR, exist_ok=True)
50
- out_path = f"{OUT_DIR}/{model_name}_eval_request_False.json"
51
-
52
- with open(out_path, "w") as f:
53
- f.write(json.dumps(eval_entry))
54
-
55
- print("Uploading eval file")
56
- API.upload_file(
57
- path_or_fileobj=out_path,
58
- path_in_repo=out_path.split("eval-queue/")[1],
59
- repo_id=QUEUE_REPO,
60
- repo_type="dataset",
61
- commit_message=f"Add {model} to eval queue",
62
- )
63
-
64
- # Remove the local file
65
- os.remove(out_path)
66
-
67
- return styled_message(
68
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
69
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{new/src → src}/texts.py RENAMED
File without changes