justinxzhao committed on
Commit 29e2769
1 Parent(s): bfcc00c

Add index.html shim, and add hero.svg

Files changed (4)
  1. app.py +150 -56
  2. img/hero.png +0 -0
  3. img/hero.svg +0 -0
  4. index.html +7 -0
app.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 from PIL import Image
 import base64
 from io import BytesIO
+import random

 # Define constants
 MAJOR_A_WIN = "A>>B"
@@ -50,6 +51,14 @@ def pil_to_base64(img):
     return img_str


+# Function to convert PIL image to base64
+def pil_svg_to_base64(img):
+    buffered = BytesIO()
+    img.save(buffered, format="SVG")
+    img_str = base64.b64encode(buffered.getvalue()).decode()
+    return img_str
+
+
 # Load your dataframes
 df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
 df_responses = pd.read_json("data/responses.jsonl", lines=True)
@@ -57,7 +66,9 @@ df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
 df_leaderboard = (
     pd.read_csv("data/leaderboard_6_11.csv").sort_values("Rank").reset_index(drop=True)
 )
-df_leaderboard = df_leaderboard.rename(columns={"EI Score": "EI Score (95% CI)"})
+df_leaderboard = df_leaderboard.rename(
+    columns={"EI Score": "Council Arena EI Score (95% CI)"}
+)

 # Prepare the scenario selector options
 df_test_set["scenario_option"] = (
@@ -84,7 +95,6 @@ div.stButton > button {
 }
 </style>
 """
-
 st.markdown(full_width_button_css, unsafe_allow_html=True)

 # Create a button that triggers the JavaScript function
@@ -104,8 +114,11 @@ with col2:
         st.write("Button 2 clicked")

 with col3:
-    if st.button("Github"):
-        st.write("Button 3 clicked")
+    st.link_button(
+        "Github",
+        "https://github.com/llm-council/llm-council",
+        use_container_width=True,
+    )

 # Custom CSS to center title and header
 center_css = """
@@ -118,35 +131,48 @@ h1, h2, h6{

 st.markdown(center_css, unsafe_allow_html=True)

-# Load an image
-image = Image.open("img/lmc_icon.png")
-
-# Convert the image to base64
-img_base64 = pil_to_base64(image)
-
-# HTML to center the image and embed base64 image
-centered_image_html = f"""
-<div style="text-align: center;">
-    <img src="data:image/png;base64,{img_base64}" width="50"/>
-</div>
-"""
-
-# Rendering the centered image
-st.markdown(centered_image_html, unsafe_allow_html=True)
-
+# Centered icon.
+# image = Image.open("img/lmc_icon.png")
+# img_base64 = pil_to_base64(image)
+# centered_image_html = f"""
+# <div style="text-align: center;">
+#     <img src="data:image/png;base64,{img_base64}" width="50"/>
+# </div>
+# """
+# st.markdown(centered_image_html, unsafe_allow_html=True)
+
+# Title and subtitle.
 st.title("Language Model Council")
 st.markdown(
-    "###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus"
+    "###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
 )

-with st.expander("Abstract (abridged)"):
-    st.markdown(
-        """Many tasks such as those related to emotional intelligence, creative writing, or persuasiveness, are highly subjective and often lack majoritarian agreement. To address the challenge of ranking LLMs on highly subjective tasks, we propose a novel benchmarking framework, the **Language Model Council (LMC)**. The LMC operates through a democratic process to:
-
-        1. Formulate a test set through equal participation.
-        2. Administer the test among council members.
-        3. Evaluate responses as a collective jury.
-        """
+# Render hero image.
+with open("img/hero.svg", "r") as file:
+    svg_content = file.read()
+
+left_co, cent_co, last_co = st.columns([0.2, 0.6, 0.2])
+with cent_co:
+    st.image(svg_content, use_column_width=True)
+
+
+with cent_co.expander("Abstract"):
+    st.markdown(
+        """The rapid advancement of Large Language Models (LLMs) necessitates robust
+        and challenging benchmarks. Leaderboards like Chatbot Arena rank LLMs based
+        on how well their responses align with human preferences. However, many tasks
+        such as those related to emotional intelligence, creative writing, or persuasiveness,
+        are highly subjective and often lack majoritarian human agreement. Judges may
+        have irreconcilable disagreements about what constitutes a better response. To
+        address the challenge of ranking LLMs on highly subjective tasks, we propose
+        a novel benchmarking framework, the Language Model Council (LMC). The
+        LMC operates through a democratic process to: 1) formulate a test set through
+        equal participation, 2) administer the test among council members, and 3) evaluate
+        responses as a collective jury. We deploy a council of 20 newest LLMs on an
+        open-ended emotional intelligence task: responding to interpersonal dilemmas.
+        Our results show that the LMC produces rankings that are more separable, robust,
+        and less biased than those from any individual LLM judge, and is more consistent
+        with a human-established leaderboard compared to other benchmarks."""
     )
 st.markdown(
     "This leaderboard comes from deploying a Council of 20 LLMs on an **open-ended emotional intelligence task: responding to interpersonal dilemmas**."
@@ -175,22 +201,66 @@ def colored_text_box(text, background_color, text_color="black"):
     return html_code


+# Ensure to initialize session state variables if they do not exist
+if "selected_scenario" not in st.session_state:
+    st.session_state.selected_scenario = None
+
+if "selected_model" not in st.session_state:
+    st.session_state.selected_model = None
+
+if "selected_judge" not in st.session_state:
+    st.session_state.selected_judge = None
+
+
+# Define callback functions to update session state
+def update_scenario():
+    st.session_state.selected_scenario = st.session_state.scenario_selector
+
+
+def update_model():
+    st.session_state.selected_model = st.session_state.model_selector
+
+
+def update_judge():
+    st.session_state.selected_judge = st.session_state.judge_selector
+
+
+def randomize_selection():
+    st.session_state.selected_scenario = random.choice(scenario_options)
+    st.session_state.selected_model = random.choice(model_options)
+    st.session_state.selected_judge = random.choice(judge_options)
+
+
 with tabs[1]:
+    # Add randomize button at the top of the app
+    _, mid_column, _ = st.columns([0.4, 0.2, 0.4])
+    mid_column.button(
+        ":game_die: Randomize!", on_click=randomize_selection, type="primary"
+    )
+
     st.markdown("### 1. Select a scenario.")
     # Create the selectors
-    selected_scenario = st.selectbox(
-        "Select Scenario", scenario_options, label_visibility="hidden"
+    st.session_state.selected_scenario = st.selectbox(
+        "Select Scenario",
+        scenario_options,
+        label_visibility="hidden",
+        key="scenario_selector",
+        on_change=update_scenario,
+        index=(
+            scenario_options.index(st.session_state.selected_scenario)
+            if st.session_state.selected_scenario
+            else 0
+        ),
     )

     # Get the selected scenario details
-    if selected_scenario:
-        selected_emobench_id = int(selected_scenario.split(": ")[0])
+    if st.session_state.selected_scenario:
+        selected_emobench_id = int(st.session_state.selected_scenario.split(": ")[0])
         scenario_details = df_test_set[
             df_test_set["emobench_id"] == selected_emobench_id
         ].iloc[0]

         # Display the detailed dilemma and additional information
-        # st.write(scenario_details["detailed_dilemma"])
         st.markdown(
             colored_text_box(
                 scenario_details["detailed_dilemma"], "#eeeeeeff", "black"
@@ -217,14 +287,13 @@ with tabs[1]:
     )

     # Get the response string for the fixed model
-    if selected_scenario:
+    if st.session_state.selected_scenario:
         response_details_fixed = df_responses[
             (df_responses["emobench_id"] == selected_emobench_id)
             & (df_responses["llm_responder"] == fixed_model)
         ].iloc[0]

         # Display the response string
-        # st.write(response_details_fixed["response_string"])
         st.markdown(
             colored_text_box(
                 response_details_fixed["response_string"], "#eeeeeeff", "black"
@@ -233,19 +302,26 @@ with tabs[1]:
     )

     with col2:
-        selected_model = st.selectbox(
-            "Select Model", model_options, key="dynamic_model"
+        st.session_state.selected_model = st.selectbox(
+            "Select Model",
+            model_options,
+            key="model_selector",
+            on_change=update_model,
+            index=(
+                model_options.index(st.session_state.selected_model)
+                if st.session_state.selected_model
+                else 0
+            ),
         )

         # Get the response string for the selected model
-        if selected_model and selected_scenario:
+        if st.session_state.selected_model and st.session_state.selected_scenario:
             response_details_dynamic = df_responses[
                 (df_responses["emobench_id"] == selected_emobench_id)
-                & (df_responses["llm_responder"] == selected_model)
+                & (df_responses["llm_responder"] == st.session_state.selected_model)
             ].iloc[0]

             # Display the response string
-            # st.write(response_details_dynamic["response_string"])
             st.markdown(
                 colored_text_box(
                     response_details_dynamic["response_string"], "#eeeeeeff", "black"
@@ -262,43 +338,65 @@ with tabs[1]:
     col1, col2 = st.columns(2)

     with col1:
-        st.write(f"**{fixed_model}** vs **{selected_model}**")
+        st.write(f"**{fixed_model}** vs **{st.session_state.selected_model}**")
         pairwise_counts_left = df_response_judging[
             (df_response_judging["first_completion_by"] == fixed_model)
-            & (df_response_judging["second_completion_by"] == selected_model)
+            & (
+                df_response_judging["second_completion_by"]
+                == st.session_state.selected_model
+            )
         ]["pairwise_choice"].value_counts()
         st.bar_chart(pairwise_counts_left)

     with col2:
-        st.write(f"**{selected_model}** vs **{fixed_model}**")
+        st.write(f"**{st.session_state.selected_model}** vs **{fixed_model}**")
         pairwise_counts_right = df_response_judging[
-            (df_response_judging["first_completion_by"] == selected_model)
+            (
+                df_response_judging["first_completion_by"]
+                == st.session_state.selected_model
+            )
             & (df_response_judging["second_completion_by"] == fixed_model)
         ]["pairwise_choice"].value_counts()
         st.bar_chart(pairwise_counts_right)

     # Create the llm_judge selector
-    st.markdown("#### Individudal LLM judges")
-    selected_judge = st.selectbox(
-        "Select Judge", judge_options, label_visibility="hidden"
+    st.markdown("#### Individual LLM judges")
+    st.session_state.selected_judge = st.selectbox(
+        "Select Judge",
+        judge_options,
+        label_visibility="hidden",
+        key="judge_selector",
+        on_change=update_judge,
+        index=(
+            judge_options.index(st.session_state.selected_judge)
+            if st.session_state.selected_judge
+            else 0
+        ),
     )

     # Get the judging details for the selected judge and models
-    if selected_judge and selected_scenario:
+    if st.session_state.selected_judge and st.session_state.selected_scenario:
         col1, col2 = st.columns(2)

         judging_details_left = df_response_judging[
-            (df_response_judging["llm_judge"] == selected_judge)
+            (df_response_judging["llm_judge"] == st.session_state.selected_judge)
             & (df_response_judging["first_completion_by"] == fixed_model)
-            & (df_response_judging["second_completion_by"] == selected_model)
+            & (
+                df_response_judging["second_completion_by"]
+                == st.session_state.selected_model
+            )
         ].iloc[0]

         judging_details_right = df_response_judging[
-            (df_response_judging["llm_judge"] == selected_judge)
-            & (df_response_judging["first_completion_by"] == selected_model)
+            (df_response_judging["llm_judge"] == st.session_state.selected_judge)
+            & (
+                df_response_judging["first_completion_by"]
+                == st.session_state.selected_model
+            )
             & (df_response_judging["second_completion_by"] == fixed_model)
         ].iloc[0]

+        # Render consistency.
         if is_consistent(
             judging_details_left["pairwise_choice"],
             judging_details_right["pairwise_choice"],
@@ -309,12 +407,10 @@ with tabs[1]:

         # Display the judging details
         with col1:
-            # st.write(f"**{fixed_model}** vs **{selected_model}**")
             if not judging_details_left.empty:
                 st.write(
                     f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
                 )
-                # st.code(judging_details_left["judging_response_string"])
                 st.markdown(
                     colored_text_box(
                         judging_details_left["judging_response_string"],
@@ -327,12 +423,10 @@ with tabs[1]:
                 st.write("No judging details found for the selected combination.")

         with col2:
-            # st.write(f"**{selected_model}** vs **{fixed_model}**")
             if not judging_details_right.empty:
                 st.write(
                     f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
                 )
-                # st.code(judging_details_right["judging_response_string"])
                 st.markdown(
                     colored_text_box(
                         judging_details_right["judging_response_string"],
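A note on the new pil_svg_to_base64 helper: Pillow has no SVG encoder, so img.save(buffered, format="SVG") raises as soon as the function is called, and its leading comment is a copy-paste of pil_to_base64's. The commit never actually calls the helper (the hero image is read as text and passed to st.image below), but if a base64 data URI for an SVG is ever needed, here is a minimal sketch that skips PIL entirely (svg_file_to_base64 is a hypothetical name, not from the commit):

import base64


def svg_file_to_base64(path: str) -> str:
    # SVG is already text, so there is no need to round-trip through PIL:
    # encode the raw file bytes directly. The result is suitable for a data
    # URI of the form "data:image/svg+xml;base64,<encoded>".
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()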
 
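The session-state plumbing introduced above follows Streamlit's keyed-widget pattern: each selectbox gets a key, an on_change callback mirrors the widget's value into st.session_state, and the ":game_die: Randomize!" button's on_click callback overwrites all three selections before the next script run. A minimal, self-contained sketch of the same pattern (option values and names are illustrative, not the commit's):

import random

import streamlit as st

OPTIONS = ["alpha", "beta", "gamma"]


def randomize():
    # Callbacks run before the script reruns, so it is safe to overwrite the
    # widget's backing value through its key here; the selectbox then renders
    # with the new value on the next run.
    st.session_state.selector = random.choice(OPTIONS)


st.button("Randomize!", on_click=randomize)
choice = st.selectbox("Select an option", OPTIONS, key="selector")
st.write(f"Current selection: {choice}")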
img/hero.png ADDED
img/hero.svg ADDED
index.html ADDED
@@ -0,0 +1,7 @@
+<iframe
+  id="your-iframe-id"
+  src="https://llm-council-emotional-intelligence-arena.hf.space"
+  frameborder="0"
+  width="100%"
+  height="100%"
+></iframe>
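One sizing caveat for the shim: height="100%" on an iframe resolves against its parent, so it only fills the viewport when every ancestor, including html and body, has an explicit height; otherwise the embed can render collapsed. The id="your-iframe-id" value also reads as a template placeholder. A hedged variant (not what the commit ships) that pins the ancestor heights:

<style>
  /* Give html/body a height so the iframe's percentage height resolves,
     and remove the default body margin. */
  html, body { height: 100%; margin: 0; }
</style>
<iframe
  src="https://llm-council-emotional-intelligence-arena.hf.space"
  frameborder="0"
  width="100%"
  height="100%"
></iframe>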