justinxzhao committed on
Commit
a129336
1 Parent(s): ae3759c

Add analysis graphs, and add color coding to interpersonal conflicts data samples.

Browse files
app.py CHANGED
@@ -51,14 +51,6 @@ def pil_to_base64(img):
51
  return img_str
52
 
53
 
54
- # Function to convert PIL image to base64
55
- def pil_svg_to_base64(img):
56
- buffered = BytesIO()
57
- img.save(buffered, format="SVG")
58
- img_str = base64.b64encode(buffered.getvalue()).decode()
59
- return img_str
60
-
61
-
62
  # Load your dataframes
63
  df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
64
  df_responses = pd.read_json("data/responses.jsonl", lines=True)
@@ -82,7 +74,7 @@ model_options = df_responses["llm_responder"].unique().tolist()
82
  # Prepare the judge selector options
83
  judge_options = df_response_judging["llm_judge"].unique().tolist()
84
 
85
- st.set_page_config(page_title="Language Model Council", page_icon="🧊", layout="wide")
86
 
87
  # Create three columns
88
  col1, col2, col3 = st.columns(3)
@@ -142,7 +134,7 @@ st.markdown(center_css, unsafe_allow_html=True)
142
  # st.markdown(centered_image_html, unsafe_allow_html=True)
143
 
144
  # Title and subtitle.
145
- st.title("Language Model Council")
146
  st.markdown(
147
  "###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
148
  )
@@ -179,11 +171,19 @@ st.markdown(
179
  )
180
 
181
  # Create horizontal tabs
182
- tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])
 
 
 
 
 
 
 
183
 
184
  # Define content for each tab
185
  with tabs[0]:
186
- st.dataframe(df_leaderboard)
 
187
 
188
 
189
  # HTML and CSS to create a text box with specified color
@@ -193,7 +193,7 @@ def colored_text_box(text, background_color, text_color="black"):
193
  background-color: {background_color};
194
  color: {text_color};
195
  padding: 10px;
196
- border-radius: 5px;
197
  ">
198
  {text}
199
  </div>
@@ -263,15 +263,21 @@ with tabs[1]:
263
  # Display the detailed dilemma and additional information
264
  st.markdown(
265
  colored_text_box(
266
- scenario_details["detailed_dilemma"], "#eeeeeeff", "black"
 
 
267
  ),
268
  unsafe_allow_html=True,
269
  )
270
  with st.expander("Additional Information"):
271
- st.write(f"**LLM Author:** {scenario_details['llm_author']}")
272
- st.write(f"**Problem:** {scenario_details['problem']}")
273
- st.write(f"**Relationship:** {scenario_details['relationship']}")
274
- st.write(f"**Scenario:** {scenario_details['scenario']}")
 
 
 
 
275
 
276
  st.divider()
277
 
@@ -296,7 +302,9 @@ with tabs[1]:
296
  # Display the response string
297
  st.markdown(
298
  colored_text_box(
299
- response_details_fixed["response_string"], "#eeeeeeff", "black"
 
 
300
  ),
301
  unsafe_allow_html=True,
302
  )
@@ -324,7 +332,9 @@ with tabs[1]:
324
  # Display the response string
325
  st.markdown(
326
  colored_text_box(
327
- response_details_dynamic["response_string"], "#eeeeeeff", "black"
 
 
328
  ),
329
  unsafe_allow_html=True,
330
  )
@@ -414,7 +424,7 @@ with tabs[1]:
414
  st.markdown(
415
  colored_text_box(
416
  judging_details_left["judging_response_string"],
417
- "#eeeeeeff",
418
  "black",
419
  ),
420
  unsafe_allow_html=True,
@@ -430,7 +440,7 @@ with tabs[1]:
430
  st.markdown(
431
  colored_text_box(
432
  judging_details_right["judging_response_string"],
433
- "#eeeeeeff",
434
  "black",
435
  ),
436
  unsafe_allow_html=True,
@@ -439,6 +449,54 @@ with tabs[1]:
439
  st.write("No judging details found for the selected combination.")
440
 
441
  with tabs[2]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  st.write(
443
  """
444
  Please reach out if you are interested in collaborating!
 
51
  return img_str
52
 
53
 
 
 
 
 
 
 
 
 
54
  # Load your dataframes
55
  df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
56
  df_responses = pd.read_json("data/responses.jsonl", lines=True)
 
74
  # Prepare the judge selector options
75
  judge_options = df_response_judging["llm_judge"].unique().tolist()
76
 
77
+ st.set_page_config(page_title="Language Model Council", page_icon="🏛️", layout="wide")
78
 
79
  # Create three columns
80
  col1, col2, col3 = st.columns(3)
 
134
  # st.markdown(centered_image_html, unsafe_allow_html=True)
135
 
136
  # Title and subtitle.
137
+ st.title("🗳️ Language Model Council")
138
  st.markdown(
139
  "###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
140
  )
 
171
  )
172
 
173
  # Create horizontal tabs
174
+ tabs = st.tabs(
175
+ [
176
+ "Leaderboard Results",
177
+ "Interpersonal Conflicts",
178
+ "Analysis",
179
+ "About Us",
180
+ ]
181
+ )
182
 
183
  # Define content for each tab
184
  with tabs[0]:
185
+ _, mid_column, _ = st.columns([0.2, 0.6, 0.2])
186
+ mid_column.dataframe(df_leaderboard)
187
 
188
 
189
  # HTML and CSS to create a text box with specified color
 
193
  background-color: {background_color};
194
  color: {text_color};
195
  padding: 10px;
196
+ border-radius: 10px;
197
  ">
198
  {text}
199
  </div>
 
263
  # Display the detailed dilemma and additional information
264
  st.markdown(
265
  colored_text_box(
266
+ scenario_details["detailed_dilemma"],
267
+ "#01204E",
268
+ "white",
269
  ),
270
  unsafe_allow_html=True,
271
  )
272
  with st.expander("Additional Information"):
273
+ st.write(
274
+ {
275
+ "LLM Author": scenario_details["llm_author"],
276
+ "Problem": scenario_details["problem"],
277
+ "Relationship": scenario_details["relationship"],
278
+ "Scenario": scenario_details["scenario"],
279
+ }
280
+ )
281
 
282
  st.divider()
283
 
 
302
  # Display the response string
303
  st.markdown(
304
  colored_text_box(
305
+ response_details_fixed["response_string"],
306
+ "#028391",
307
+ "white",
308
  ),
309
  unsafe_allow_html=True,
310
  )
 
332
  # Display the response string
333
  st.markdown(
334
  colored_text_box(
335
+ response_details_dynamic["response_string"],
336
+ "#028391",
337
+ "white",
338
  ),
339
  unsafe_allow_html=True,
340
  )
 
424
  st.markdown(
425
  colored_text_box(
426
  judging_details_left["judging_response_string"],
427
+ "#FEAE6F",
428
  "black",
429
  ),
430
  unsafe_allow_html=True,
 
440
  st.markdown(
441
  colored_text_box(
442
  judging_details_right["judging_response_string"],
443
+ "#FEAE6F",
444
  "black",
445
  ),
446
  unsafe_allow_html=True,
 
449
  st.write("No judging details found for the selected combination.")
450
 
451
  with tabs[2]:
452
+ st.markdown("### Battles (Respondent vs. Respondent)")
453
+ st.write("Expected win rates based on Terry-Bradley coefficients:")
454
+ image = Image.open("img/llm_vs_llm_win_rates.png")
455
+ img_base64 = pil_to_base64(image)
456
+ centered_image_html = f"""
457
+ <div style="text-align: center;">
458
+ <img src="data:image/png;base64,{img_base64}" width="1000"/>
459
+ </div>
460
+ """
461
+ st.markdown(centered_image_html, unsafe_allow_html=True)
462
+
463
+ st.markdown("### Affinities (Judge vs. Respondent)")
464
+
465
+ st.write("Raw affinities:")
466
+ image = Image.open("img/raw.png")
467
+ img_base64 = pil_to_base64(image)
468
+ centered_image_html = f"""
469
+ <div style="text-align: center;">
470
+ <img src="data:image/png;base64,{img_base64}" width="1000"/>
471
+ </div>
472
+ """
473
+ st.markdown(centered_image_html, unsafe_allow_html=True)
474
+
475
+ st.write("Council-Normalized:")
476
+ image = Image.open("img/council_normalized.png")
477
+ img_base64 = pil_to_base64(image)
478
+ centered_image_html = f"""
479
+ <div style="text-align: center;">
480
+ <img src="data:image/png;base64,{img_base64}" width="1000"/>
481
+ </div>
482
+ """
483
+ st.markdown(centered_image_html, unsafe_allow_html=True)
484
+
485
+ st.markdown("### Agreement (Judge vs. Judge)")
486
+
487
+ st.write("Sidewise Cohen's Kappa:")
488
+ image = Image.open("img/judge_agreement.sidewise_cohen_kappa.png")
489
+ img_base64 = pil_to_base64(image)
490
+ centered_image_html = f"""
491
+ <div style="text-align: center;">
492
+ <img src="data:image/png;base64,{img_base64}" width="1000"/>
493
+ </div>
494
+ """
495
+ st.markdown(centered_image_html, unsafe_allow_html=True)
496
+
497
+ st.write("Check out the paper for more detailed analysis!")
498
+
499
+ with tabs[-1]:
500
  st.write(
501
  """
502
  Please reach out if you are interested in collaborating!
img/council_normalized.png ADDED
img/judge_agreement.sidewise_cohen_kappa.png ADDED
img/llm_vs_llm_win_rates.png ADDED
img/raw.png ADDED