Corey Morris
committed on
Commit
•
a5fb364
1
Parent(s):
c10db67
Added introduction, links, and reduced the number of plots displayed
Browse files
app.py
CHANGED
@@ -5,7 +5,15 @@ from result_data_processor import ResultDataProcessor
|
|
5 |
|
6 |
data_provider = ResultDataProcessor()
|
7 |
|
8 |
-
st.title('Model Evaluation Results including MMLU by task')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
filters = st.checkbox('Select Models and Evaluations')
|
11 |
|
@@ -36,8 +44,6 @@ filtered_data = filtered_data.sort_values(by=['MMLU_average'], ascending=False)
|
|
36 |
st.dataframe(filtered_data[selected_columns])
|
37 |
|
38 |
# CSV download
|
39 |
-
# name the index to include in the csv download
|
40 |
-
|
41 |
|
42 |
filtered_data.index.name = "Model Name"
|
43 |
|
@@ -108,7 +114,7 @@ def create_plot(df, arc_column, moral_column, models=None):
|
|
108 |
# Custom scatter plots
|
109 |
st.header('Custom scatter plots')
|
110 |
selected_x_column = st.selectbox('Select x-axis', filtered_data.columns.tolist(), index=0)
|
111 |
-
selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=
|
112 |
|
113 |
if selected_x_column != selected_y_column: # Avoid creating a plot with the same column on both axes
|
114 |
fig = create_plot(filtered_data, selected_x_column, selected_y_column)
|
@@ -118,42 +124,27 @@ else:
|
|
118 |
|
119 |
# end of custom scatter plots
|
120 |
|
121 |
-
st.header('
|
122 |
-
|
123 |
-
fig = create_plot(filtered_data, 'arc:challenge|25', 'hellaswag|10')
|
124 |
-
st.plotly_chart(fig)
|
125 |
-
|
126 |
-
fig = create_plot(filtered_data, 'arc:challenge|25', 'MMLU_average')
|
127 |
-
st.plotly_chart(fig)
|
128 |
-
|
129 |
-
fig = create_plot(filtered_data, 'hellaswag|10', 'MMLU_average')
|
130 |
-
st.plotly_chart(fig)
|
131 |
-
|
132 |
-
st.header('Top 50 models on MMLU_average')
|
133 |
-
top_50 = filtered_data.nlargest(50, 'MMLU_average')
|
134 |
-
fig = create_plot(top_50, 'arc:challenge|25', 'MMLU_average')
|
135 |
-
st.plotly_chart(fig)
|
136 |
-
|
137 |
-
st.header('Moral Reasoning')
|
138 |
-
|
139 |
-
fig = create_plot(filtered_data, 'arc:challenge|25', 'MMLU_moral_scenarios')
|
140 |
-
st.plotly_chart(fig)
|
141 |
|
142 |
-
fig = create_plot(filtered_data, '
|
143 |
st.plotly_chart(fig)
|
144 |
|
145 |
-
fig = create_plot(filtered_data, '
|
146 |
st.plotly_chart(fig)
|
147 |
|
148 |
fig = px.histogram(filtered_data, x="MMLU_moral_scenarios", marginal="rug", hover_data=filtered_data.columns)
|
149 |
st.plotly_chart(fig)
|
150 |
|
151 |
-
|
|
|
152 |
st.plotly_chart(fig)
|
153 |
|
|
|
|
|
154 |
|
155 |
-
st.markdown("**Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations**")
|
156 |
|
|
|
157 |
|
158 |
st.markdown("""
|
159 |
# References
|
|
|
5 |
|
6 |
data_provider = ResultDataProcessor()
|
7 |
|
8 |
+
# st.title('Model Evaluation Results including MMLU by task')
|
9 |
+
st.title('MMLU-by-Task Evaluation Results for 500+ Open Source Models')
|
10 |
+
st.markdown("""***Last updated August 7th***""")
|
11 |
+
st.markdown("""
|
12 |
+
Hugging Face has run evaluations on over 500 open source models and provides results on a
|
13 |
+
[publicly available leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [dataset](https://huggingface.co/datasets/open-llm-leaderboard/results).
|
14 |
+
The leaderboard currently displays the overall result for MMLU. This page shows individual accuracy scores for all 57 tasks of the MMLU evaluation.
|
15 |
+
[Preliminary analysis of MMLU-by-Task data](https://coreymorrisdata.medium.com/preliminary-analysis-of-mmlu-evaluation-data-insights-from-500-open-source-models-e67885aa364b)
|
16 |
+
""")
|
17 |
|
18 |
filters = st.checkbox('Select Models and Evaluations')
|
19 |
|
|
|
44 |
st.dataframe(filtered_data[selected_columns])
|
45 |
|
46 |
# CSV download
|
|
|
|
|
47 |
|
48 |
filtered_data.index.name = "Model Name"
|
49 |
|
|
|
114 |
# Custom scatter plots
|
115 |
st.header('Custom scatter plots')
|
116 |
selected_x_column = st.selectbox('Select x-axis', filtered_data.columns.tolist(), index=0)
|
117 |
+
selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=3)
|
118 |
|
119 |
if selected_x_column != selected_y_column: # Avoid creating a plot with the same column on both axes
|
120 |
fig = create_plot(filtered_data, selected_x_column, selected_y_column)
|
|
|
124 |
|
125 |
# end of custom scatter plots
|
126 |
|
127 |
+
st.header('Moral Scenarios Performance')
|
128 |
+
st.write("The dashed red line represents the random chance performance of 0.25")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
+
fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
|
131 |
st.plotly_chart(fig)
|
132 |
|
133 |
+
fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios')
|
134 |
st.plotly_chart(fig)
|
135 |
|
136 |
fig = px.histogram(filtered_data, x="MMLU_moral_scenarios", marginal="rug", hover_data=filtered_data.columns)
|
137 |
st.plotly_chart(fig)
|
138 |
|
139 |
+
st.header('Abstract Algebra Performance')
|
140 |
+
fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
|
141 |
st.plotly_chart(fig)
|
142 |
|
143 |
+
fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_abstract_algebra')
|
144 |
+
st.plotly_chart(fig)
|
145 |
|
|
|
146 |
|
147 |
+
st.markdown("***Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations.***")
|
148 |
|
149 |
st.markdown("""
|
150 |
# References
|