Spaces:
Runtime error
Runtime error
justinxzhao
committed on
Commit
•
bfcc00c
1
Parent(s):
7ee6d4e
Nice touches.
Browse files
app.py
CHANGED
@@ -87,10 +87,17 @@ div.stButton > button {
|
|
87 |
|
88 |
st.markdown(full_width_button_css, unsafe_allow_html=True)
|
89 |
|
|
|
|
|
|
|
|
|
90 |
# Place a button in each column
|
91 |
with col1:
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
94 |
|
95 |
with col2:
|
96 |
if st.button("Paper"):
|
@@ -103,7 +110,7 @@ with col3:
|
|
103 |
# Custom CSS to center title and header
|
104 |
center_css = """
|
105 |
<style>
|
106 |
-
h1, h2{
|
107 |
text-align: center;
|
108 |
}
|
109 |
</style>
|
@@ -128,7 +135,22 @@ centered_image_html = f"""
|
|
128 |
st.markdown(centered_image_html, unsafe_allow_html=True)
|
129 |
|
130 |
st.title("Language Model Council")
|
131 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
# Create horizontal tabs
|
134 |
tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])
|
@@ -137,6 +159,22 @@ tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])
|
|
137 |
with tabs[0]:
|
138 |
st.dataframe(df_leaderboard)
|
139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
with tabs[1]:
|
141 |
st.markdown("### 1. Select a scenario.")
|
142 |
# Create the selectors
|
@@ -152,7 +190,13 @@ with tabs[1]:
|
|
152 |
].iloc[0]
|
153 |
|
154 |
# Display the detailed dilemma and additional information
|
155 |
-
st.write(scenario_details["detailed_dilemma"])
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
with st.expander("Additional Information"):
|
157 |
st.write(f"**LLM Author:** {scenario_details['llm_author']}")
|
158 |
st.write(f"**Problem:** {scenario_details['problem']}")
|
@@ -180,7 +224,13 @@ with tabs[1]:
|
|
180 |
].iloc[0]
|
181 |
|
182 |
# Display the response string
|
183 |
-
st.write(response_details_fixed["response_string"])
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
|
185 |
with col2:
|
186 |
selected_model = st.selectbox(
|
@@ -195,7 +245,13 @@ with tabs[1]:
|
|
195 |
].iloc[0]
|
196 |
|
197 |
# Display the response string
|
198 |
-
st.write(response_details_dynamic["response_string"])
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
|
200 |
st.divider()
|
201 |
|
@@ -219,11 +275,9 @@ with tabs[1]:
|
|
219 |
(df_response_judging["first_completion_by"] == selected_model)
|
220 |
& (df_response_judging["second_completion_by"] == fixed_model)
|
221 |
]["pairwise_choice"].value_counts()
|
222 |
-
|
223 |
st.bar_chart(pairwise_counts_right)
|
224 |
|
225 |
# Create the llm_judge selector
|
226 |
-
# st.write("**Select an individual judge for detailed inpsection.**")
|
227 |
st.markdown("#### Individudal LLM judges")
|
228 |
selected_judge = st.selectbox(
|
229 |
"Select Judge", judge_options, label_visibility="hidden"
|
@@ -260,7 +314,15 @@ with tabs[1]:
|
|
260 |
st.write(
|
261 |
f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
|
262 |
)
|
263 |
-
st.code(judging_details_left["judging_response_string"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
else:
|
265 |
st.write("No judging details found for the selected combination.")
|
266 |
|
@@ -270,21 +332,26 @@ with tabs[1]:
|
|
270 |
st.write(
|
271 |
f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
|
272 |
)
|
273 |
-
st.code(judging_details_right["judging_response_string"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
else:
|
275 |
st.write("No judging details found for the selected combination.")
|
276 |
|
277 |
with tabs[2]:
|
278 |
-
st.write("This is the about us page.")
|
279 |
-
# Add your about us content here
|
280 |
st.write(
|
281 |
"""
|
282 |
-
|
283 |
-
To provide the best service and data insights.
|
284 |
|
285 |
**Our Team:**
|
286 |
-
-
|
287 |
-
-
|
288 |
-
-
|
289 |
"""
|
290 |
)
|
|
|
87 |
|
88 |
st.markdown(full_width_button_css, unsafe_allow_html=True)
|
89 |
|
90 |
+
# Create a button that triggers the JavaScript function
|
91 |
+
# if st.button(button_text):
|
92 |
+
# st.markdown('<script type="text/javascript">openUrl()</script>', unsafe_allow_html=True)
|
93 |
+
|
94 |
# Place a button in each column
|
95 |
with col1:
|
96 |
+
st.link_button(
|
97 |
+
"Data",
|
98 |
+
"https://huggingface.co/datasets/llm-council/emotional_application",
|
99 |
+
use_container_width=True,
|
100 |
+
)
|
101 |
|
102 |
with col2:
|
103 |
if st.button("Paper"):
|
|
|
110 |
# Custom CSS to center title and header
|
111 |
center_css = """
|
112 |
<style>
|
113 |
+
h1, h2, h6{
|
114 |
text-align: center;
|
115 |
}
|
116 |
</style>
|
|
|
135 |
st.markdown(centered_image_html, unsafe_allow_html=True)
|
136 |
|
137 |
st.title("Language Model Council")
|
138 |
+
st.markdown(
|
139 |
+
"###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus"
|
140 |
+
)
|
141 |
+
|
142 |
+
with st.expander("Abstract (abridged)"):
|
143 |
+
st.markdown(
|
144 |
+
"""Many tasks such as those related to emotional intelligence, creative writing, or persuasiveness, are highly subjective and often lack majoritarian agreement. To address the challenge of ranking LLMs on highly subjective tasks, we propose a novel benchmarking framework, the **Language Model Council (LMC)**. The LMC operates through a democratic process to:
|
145 |
+
|
146 |
+
1. Formulate a test set through equal participation.
|
147 |
+
2. Administer the test among council members.
|
148 |
+
3. Evaluate responses as a collective jury.
|
149 |
+
"""
|
150 |
+
)
|
151 |
+
st.markdown(
|
152 |
+
"This leaderboard comes from deploying a Council of 20 LLMs on an **open-ended emotional intelligence task: responding to interpersonal dilemmas**."
|
153 |
+
)
|
154 |
|
155 |
# Create horizontal tabs
|
156 |
tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])
|
|
|
159 |
with tabs[0]:
|
160 |
st.dataframe(df_leaderboard)
|
161 |
|
162 |
+
|
163 |
+
# HTML and CSS to create a text box with specified color
|
164 |
+
def colored_text_box(text: str, background_color: str, text_color: str = "black") -> str:
    """Return an HTML ``<div>`` snippet that shows *text* in a colored box.

    The box uses an inline ``style`` attribute with the given background and
    text colors, 10px padding, and a 5px border radius.

    Args:
        text: Content placed inside the ``<div>``. It is interpolated
            verbatim; no HTML escaping is applied here.
        background_color: CSS color value used for ``background-color``.
        text_color: CSS color value used for ``color``. Defaults to
            ``"black"``.

    Returns:
        The HTML markup as a string.
    """
    # NOTE(review): `text` is inserted as-is, so any markup it contains
    # becomes part of the returned HTML. Confirm the content is trusted
    # wherever the result is rendered as raw HTML.
    html_code = f"""
    <div style="
        background-color: {background_color};
        color: {text_color};
        padding: 10px;
        border-radius: 5px;
    ">
        {text}
    </div>
    """
    return html_code
|
176 |
+
|
177 |
+
|
178 |
with tabs[1]:
|
179 |
st.markdown("### 1. Select a scenario.")
|
180 |
# Create the selectors
|
|
|
190 |
].iloc[0]
|
191 |
|
192 |
# Display the detailed dilemma and additional information
|
193 |
+
# st.write(scenario_details["detailed_dilemma"])
|
194 |
+
st.markdown(
|
195 |
+
colored_text_box(
|
196 |
+
scenario_details["detailed_dilemma"], "#eeeeeeff", "black"
|
197 |
+
),
|
198 |
+
unsafe_allow_html=True,
|
199 |
+
)
|
200 |
with st.expander("Additional Information"):
|
201 |
st.write(f"**LLM Author:** {scenario_details['llm_author']}")
|
202 |
st.write(f"**Problem:** {scenario_details['problem']}")
|
|
|
224 |
].iloc[0]
|
225 |
|
226 |
# Display the response string
|
227 |
+
# st.write(response_details_fixed["response_string"])
|
228 |
+
st.markdown(
|
229 |
+
colored_text_box(
|
230 |
+
response_details_fixed["response_string"], "#eeeeeeff", "black"
|
231 |
+
),
|
232 |
+
unsafe_allow_html=True,
|
233 |
+
)
|
234 |
|
235 |
with col2:
|
236 |
selected_model = st.selectbox(
|
|
|
245 |
].iloc[0]
|
246 |
|
247 |
# Display the response string
|
248 |
+
# st.write(response_details_dynamic["response_string"])
|
249 |
+
st.markdown(
|
250 |
+
colored_text_box(
|
251 |
+
response_details_dynamic["response_string"], "#eeeeeeff", "black"
|
252 |
+
),
|
253 |
+
unsafe_allow_html=True,
|
254 |
+
)
|
255 |
|
256 |
st.divider()
|
257 |
|
|
|
275 |
(df_response_judging["first_completion_by"] == selected_model)
|
276 |
& (df_response_judging["second_completion_by"] == fixed_model)
|
277 |
]["pairwise_choice"].value_counts()
|
|
|
278 |
st.bar_chart(pairwise_counts_right)
|
279 |
|
280 |
# Create the llm_judge selector
|
|
|
281 |
st.markdown("#### Individudal LLM judges")
|
282 |
selected_judge = st.selectbox(
|
283 |
"Select Judge", judge_options, label_visibility="hidden"
|
|
|
314 |
st.write(
|
315 |
f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
|
316 |
)
|
317 |
+
# st.code(judging_details_left["judging_response_string"])
|
318 |
+
st.markdown(
|
319 |
+
colored_text_box(
|
320 |
+
judging_details_left["judging_response_string"],
|
321 |
+
"#eeeeeeff",
|
322 |
+
"black",
|
323 |
+
),
|
324 |
+
unsafe_allow_html=True,
|
325 |
+
)
|
326 |
else:
|
327 |
st.write("No judging details found for the selected combination.")
|
328 |
|
|
|
332 |
st.write(
|
333 |
f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
|
334 |
)
|
335 |
+
# st.code(judging_details_right["judging_response_string"])
|
336 |
+
st.markdown(
|
337 |
+
colored_text_box(
|
338 |
+
judging_details_right["judging_response_string"],
|
339 |
+
"#eeeeeeff",
|
340 |
+
"black",
|
341 |
+
),
|
342 |
+
unsafe_allow_html=True,
|
343 |
+
)
|
344 |
else:
|
345 |
st.write("No judging details found for the selected combination.")
|
346 |
|
347 |
with tabs[2]:
|
|
|
|
|
348 |
st.write(
|
349 |
"""
|
350 |
+
Please reach out if you are interested in collaborating!
|
|
|
351 |
|
352 |
**Our Team:**
|
353 |
+
- Justin Zhao ([email protected])
|
354 |
+
- Flor Plaza ([email protected])
|
355 |
+
- Amanda Cercas Curry ([email protected])
|
356 |
"""
|
357 |
)
|