Lisa Dunlap committed
Commit e121d4e • Parent(s): df2a130

moved around text for aesthetic purposes

Files changed (1): app.py +32 -60
app.py CHANGED
@@ -26,7 +26,7 @@ def make_default_md(arena_df, elo_results):
 | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
+We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system. Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
 """
     return leaderboard_md
@@ -44,34 +44,10 @@ def make_arena_leaderboard_md(arena_df, arena_subset_df=None, name="Overall"):
     leaderboard_md = f"""
 Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{vote_str}{space} Last updated: March 29, 2024.
 
-Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
-
 **NEW!** Click the buttons below to view the ELO leaderboard and stats for different input categories. You are currently viewing **{name}** inputs.
 """
     return leaderboard_md
 
-# def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
-#     # Calculate totals for each arena
-#     total_votes = sum(arena_df["num_battles"]) // 2
-#     total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
-#     total_long_votes = sum(arena_long_df["num_battles"]) // 2
-#     total_english_votes = sum(arena_english_df["num_battles"]) // 2
-
-#     # Constructing the markdown table
-#     leaderboard_md = f"""
-# Last updated: March 29, 2024.
-# | | **Total** | English | Chinese | Long Context |
-# | :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
-# | # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
-# | # Models | **{len(arena_df)}** | {len(arena_english_df)} | {len(arena_chinese_df)} | {len(arena_long_df)} |
-
-# Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
-# """
-
-#     return leaderboard_md
-
-
-
 def make_full_leaderboard_md(elo_results):
     leaderboard_md = f"""
 Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
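
Note on the vote totals in this hunk: num_battles counts every battle a model appeared in, so summing it over models counts each battle twice, which is why the removed helper divided by two. A minimal standalone sketch with hypothetical numbers (assumes pandas; not the app's actual data):

import pandas as pd

# Hypothetical stand-in for arena_df: each battle is counted once per
# participant, so the column sums to twice the number of battles.
arena_df = pd.DataFrame({"model": ["a", "b", "c"], "num_battles": [10, 12, 8]})

total_votes = sum(arena_df["num_battles"]) // 2  # 30 // 2 = 15 battles
print("Total #votes: {:,}".format(total_votes))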
@@ -301,8 +277,7 @@ def update_leaderboard_and_plots(button, arena_df, model_table_df, arena_subset_
     p2 = elo_subset_results["battle_count_heatmap"]
     p3 = elo_subset_results["bootstrap_elo_rating"]
     p4 = elo_subset_results["average_win_rate_bar"]
-    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})\n
-You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})
 """
     leaderboard_md = make_arena_leaderboard_md(arena_df, arena_subset_df, name=button)
     return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
@@ -383,11 +358,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
             column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
             wrap=True,
         )
-        # Setup the button click action
-        # overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
-        # english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
-        # chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating, outputs=elo_display_df)
-        # long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
 
     with gr.Tab("Full Leaderboard", id=1):
         md = make_full_leaderboard_md(elo_results)
@@ -422,7 +392,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
         gr.Markdown(
             f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
 A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
-See Figure 3 below for visualization of the confidence intervals.
+See Figure 3 below for visualization of the confidence intervals. Code to recreate these tables and plots in this [notebook]({notebook_url}) and more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
 """,
             elem_id="leaderboard_markdown"
         )
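
The ranking rule stated in this note deserves a worked example; a minimal sketch with hypothetical scores (not the app's actual ranking code):

# Model A is ranked above model B only if A's lower confidence bound
# exceeds B's upper confidence bound; overlapping intervals give no strict rank.
def ranked_higher(a_lower: float, b_upper: float) -> bool:
    return a_lower > b_upper

# Hypothetical 95% intervals: A = [1245, 1255], B = [1232, 1248].
print(ranked_higher(1245, 1248))  # False: the intervals overlap
print(ranked_higher(1245, 1240))  # True once B's upper bound falls below 1245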
@@ -430,32 +400,31 @@ See Figure 3 below for visualization of the confidence intervals.
     leader_component_values[:] = [default_md, p1, p2, p3, p4]
 
     if show_plot:
-        more_stats_md = gr.Markdown(
-            f"""## More Statistics for Chatbot Arena\n
-You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
-""",
-            elem_id="leaderboard_markdown"
-        )
+        # more_stats_md = gr.Markdown(
+        #     f"""## More Statistics for Chatbot Arena (Overall)""",
+        #     elem_id="leaderboard_markdown"
+        # )
+        more_stats_md = gr.Button("More Statistics for Chatbot Arena (Overall)", elem_id="non-interactive-button")
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
+                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles", elem_id="plot-title", variant="panel"
                 )
-                plot_1 = gr.Plot(p1, show_label=False)
+                plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
+                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)", elem_id="plot-title"
                 )
                 plot_2 = gr.Plot(p2, show_label=False)
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)"
+                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)", elem_id="plot-title"
                 )
                 plot_3 = gr.Plot(p3, show_label=False)
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
+                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)", elem_id="plot-title"
                 )
                 plot_4 = gr.Plot(p4, show_label=False)
@@ -494,21 +463,23 @@ block_css = """
     line-height: 0.1em;
 }
 
-#arena_leaderboard_dataframe td {
-    line-height: 0.15em;
-    font-size: 18px;
-}
-#arena_leaderboard_dataframe th {
-    font-size: 20px;
+#plot-title {
+    text-align: center;
+    display: block;
 }
 
-
-#full_leaderboard_dataframe td {
-    line-height: 0.15em;
-    font-size: 18px;
-}
-#full_leaderboard_dataframe th {
-    font-size: 20px;
+#non-interactive-button {
+    display: inline-block;
+    padding: 10px 10px;
+    background-color: #f7f7f7; /* Super light grey background */
+    color: #000000; /* Black text */
+    text-align: center;
+    font-size: 26px; /* Larger text */
+    border-radius: 0; /* Straight edges, no border radius */
+    border: 0px solid #dcdcdc; /* A light grey border to match the background */
+    font-weight: bold;
+    user-select: none; /* The text inside the button is not selectable */
+    pointer-events: none; /* The button is non-interactive */
 }
 
 footer {
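
The #non-interactive-button rule added here is what lets the gr.Button from the previous hunk act as a static banner: pointer-events: none disables clicks and user-select: none disables text selection. A minimal sketch of the pattern (assumes only gradio is installed):

import gradio as gr

# CSS mirroring the rule above: the button renders but takes no clicks.
css = """
#non-interactive-button {
    pointer-events: none;
    user-select: none;
    font-weight: bold;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Button("More Statistics for Chatbot Arena (Overall)",
              elem_id="non-interactive-button")

if __name__ == "__main__":
    demo.launch()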
@@ -540,11 +511,12 @@ We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a1
 
 def build_demo(elo_results_file, leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
-
+    theme = gr.themes.Base(text_size=text_size)
+    theme.set(button_secondary_background_fill_hover="*primary_300", button_secondary_background_fill_hover_dark="*primary_700")
     with gr.Blocks(
         title="Chatbot Arena Leaderboard",
-        # theme=gr.themes.Base(text_size=text_size),
-        theme = gr.themes.Base.load("theme.json"),
+        theme=theme,
+        # theme = gr.themes.Base.load("theme.json"),
         css=block_css,
     ) as demo:
         leader_components = build_leaderboard_tab(