Spaces:
Running
Running
JaydenCool
committed on
Commit
•
d7041cd
1
Parent(s):
fd065ec
update files
Browse files- README.md +4 -4
- app.py +437 -0
- assets/logo.png +0 -0
- assets/text.py +68 -0
- changelog.md +8 -0
- data/chinese_benchmark_gen.csv +27 -0
- data/chinese_benchmark_per.csv +23 -0
- data/subclass_gen.csv +23 -0
- data/subclass_per.csv +23 -0
- requirements.txt +65 -0
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
-
title: ChineseSafe
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.38.1
|
8 |
app_file: app.py
|
|
|
1 |
---
|
2 |
+
title: ChineseSafe
|
3 |
+
emoji: 🌍
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: purple
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.38.1
|
8 |
app_file: app.py
|
app.py
ADDED
@@ -0,0 +1,437 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
|
7 |
+
|
8 |
+
|
9 |
+
ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", sep='\t') # space separated values
|
10 |
+
ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", sep='\t') #
|
11 |
+
|
12 |
+
ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',') #
|
13 |
+
ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')
|
14 |
+
|
15 |
+
METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
|
16 |
+
|
17 |
+
|
18 |
+
SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
|
19 |
+
|
20 |
+
#SPLITS = ["Overall", "Subclass"]
|
21 |
+
SPLITS = ["Overall", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
|
22 |
+
|
23 |
+
CLASSIFICATION = {
|
24 |
+
"model_size": [
|
25 |
+
">65B",
|
26 |
+
"~30B",
|
27 |
+
"10B~20B",
|
28 |
+
"5B~10B",
|
29 |
+
"API",
|
30 |
+
]
|
31 |
+
|
32 |
+
}
|
33 |
+
|
34 |
+
|
35 |
+
_BIBTEX = """ Waiting for paper ... """
|
36 |
+
|
37 |
+
_LAST_UPDATED = "July 21, 2024"
|
38 |
+
|
39 |
+
banner_url = "./assets/logo.png"
|
40 |
+
_BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>' # noqa
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
|
45 |
+
def retrieve_array_from_text(text):
    """Parse a bracketed, comma-separated number string into a float array.

    Example: ``"[1.0, 2.5]"`` -> ``np.array([1.0, 2.5])``.

    Fix: ``np.fromstring`` is deprecated for text input (DeprecationWarning
    since NumPy 1.14) and silently truncates on malformed tokens; parse the
    tokens explicitly instead. Empty/blank input yields an empty array, as
    before.
    """
    cleaned = text.replace("[", "").replace("]", "")
    # Ignore empty fragments (e.g. trailing commas) so "1,2," parses like
    # np.fromstring did.
    tokens = [tok for tok in cleaned.split(",") if tok.strip()]
    return np.array([float(tok) for tok in tokens], dtype=float)
|
47 |
+
|
48 |
+
def format_csv_numbers(text):
    """Return the value part of a ``"metric/std"`` cell (text before '/').

    If the cell has no '/', the whole string is returned unchanged.
    """
    value, _, _std = text.partition('/')
    return value
|
50 |
+
|
51 |
+
def format_csv_numbers_second(text):
    # Split a whitespace-separated cell into its individual tokens.
    # NOTE(review): appears unused within this module — confirm before removing.
    return text.split()
|
53 |
+
|
54 |
+
|
55 |
+
def format_number(x):
    """Round *x* to three significant digits and return it as a float."""
    rounded = format(x, ".3")
    return float(rounded)
|
57 |
+
|
58 |
+
|
59 |
+
def get_dataset_csv(
    model_size: List[str],
):
    """Build the overall generation leaderboard table.

    Keeps only rows whose ``Size`` bucket is in *model_size*, hides the
    internal ``Size`` column, and returns the result as a read-only,
    visible Gradio Dataframe component.
    """
    mask = ORIGINAL_DF["Size"].isin(model_size)
    table = ORIGINAL_DF[mask].drop(columns="Size")
    return gr.components.Dataframe(
        value=table,
        interactive=False,
        visible=True,
    )
|
77 |
+
|
78 |
+
def get_dataset_csv_per(
    model_size: List[str],
):
    """Build the overall multiple-choice (perplexity) leaderboard table.

    Keeps only rows whose ``Size`` bucket is in *model_size*, hides the
    internal ``Size`` column, and returns the result as a read-only,
    visible Gradio Dataframe component.
    """
    mask = ORIGINAL_DF_PER["Size"].isin(model_size)
    table = ORIGINAL_DF_PER[mask].drop(columns="Size")
    return gr.components.Dataframe(
        value=table,
        interactive=False,
        visible=True,
    )
|
96 |
+
|
97 |
+
# this is a sub function for csv table
|
98 |
+
def get_dataset_csv_sub_gen(
    model_size: List[str],
    subclass_choice: List[str],
):
    """Build the per-subclass generation leaderboard table.

    Filters rows to the requested model sizes, then keeps the model name
    plus the three metric columns (Accuracy / Precision / Recall) of the
    chosen safety subclass, returned as a read-only Gradio Dataframe.
    """
    rows = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN["Size"].isin(model_size)]
    rows = rows.drop(columns="Size")
    # Subclass metrics are stored as "<Subclass>_<Metric>" columns.
    wanted = ["Model"] + [f"{subclass_choice}_{m}" for m in ("Accuracy", "Precision", "Recall")]
    return gr.components.Dataframe(
        value=rows[wanted],
        interactive=False,
        visible=True,
    )
|
125 |
+
|
126 |
+
# this is a sub function for csv table
|
127 |
+
def get_dataset_csv_sub_per(
    model_size: List[str],
    subclass_choice: List[str],
):
    """Build the per-subclass multiple-choice (perplexity) leaderboard table.

    Filters rows to the requested model sizes, then keeps the model name
    plus the three metric columns (Accuracy / Precision / Recall) of the
    chosen safety subclass, returned as a read-only Gradio Dataframe.
    """
    rows = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER["Size"].isin(model_size)]
    rows = rows.drop(columns="Size")
    # Subclass metrics are stored as "<Subclass>_<Metric>" columns.
    wanted = ["Model"] + [f"{subclass_choice}_{m}" for m in ("Accuracy", "Precision", "Recall")]
    return gr.components.Dataframe(
        value=rows[wanted],
        interactive=False,
        visible=True,
    )
|
154 |
+
|
155 |
+
|
156 |
+
def get_dataset_classfier_gen(
    model_size: List[str],
    main_choice: List[str],
):
    """Dispatch to the overall or per-subclass generation table.

    Fix: the original ``if main_choice == "Overall" ... elif main_choice !=
    "Subclass"`` left ``leaderboard_table`` unbound for the literal input
    "Subclass" (UnboundLocalError on return). An exhaustive if/else covers
    every input and matches the ``_per`` twin's behavior.
    """
    if main_choice == "Overall":
        return get_dataset_csv(model_size)
    # Any non-"Overall" split name is a subclass selection.
    return get_dataset_csv_sub_gen(model_size, main_choice)
|
166 |
+
|
167 |
+
def get_dataset_classfier_per(
    model_size: List[str],
    main_choice: List[str],
):
    """Dispatch to the overall or per-subclass multiple-choice table.

    "Overall" shows the aggregate table; any other split name is treated
    as a subclass selection.
    """
    if main_choice == "Overall":
        return get_dataset_csv_per(model_size)
    return get_dataset_csv_sub_per(model_size, main_choice)
|
177 |
+
|
178 |
+
with gr.Blocks() as demo:
    # ---- static header & description -------------------------------------
    gr.Markdown("<center><h1>ChineseSafe Leaderboard</h1></center>", elem_classes="markdown-text")
    with gr.Row():
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")

    with gr.Row():
        gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")

    # ---- filter controls --------------------------------------------------
    with gr.Row():
        with gr.Column(scale=0.8):
            main_choice = gr.Dropdown(
                choices=SPLITS,
                value="Overall",
                label="Type",
                info="Please choose the type to display.",
            )

        with gr.Column(scale=10):
            model_choice = gr.CheckboxGroup(
                choices=CLASSIFICATION["model_size"],
                value=CLASSIFICATION["model_size"],  # all sizes selected by default
                label="Model Size",
                info="Please choose the model size to display.",
            )

    # ---- result tables ----------------------------------------------------
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
            dataframe_all_gen = gr.components.Dataframe(
                elem_id="leaderboard-table",
            )

        with gr.TabItem("🏅 Multiple Choice", elem_id="od-benchmark-tab-table", id=5):
            dataframe_all_per = gr.components.Dataframe(
                elem_id="leaderboard-table",
            )

    # ---- footer ------------------------------------------------------------
    with gr.Row():
        gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")

    with gr.Row():
        gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")

    gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")

    # ---- event wiring ------------------------------------------------------
    # Multiple-choice (perplexity) table: refresh on either control change
    # and on initial page load.
    main_choice.change(
        get_dataset_classfier_per,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_per,
    )
    model_choice.change(
        get_dataset_classfier_per,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_per,
    )
    demo.load(
        fn=get_dataset_classfier_per,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_per,
    )

    # Generation table: same triggers.
    main_choice.change(
        get_dataset_classfier_gen,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )
    model_choice.change(
        get_dataset_classfier_gen,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )
    demo.load(
        fn=get_dataset_classfier_gen,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )

demo.launch()
|
437 |
+
|
assets/logo.png
ADDED
assets/text.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
INTRODUCTION_TEXT = """
|
2 |
+
|
3 |
+
<span style="font-size:16px; font-family: 'Times New Roman', serif;"> <b> Welcome to the ChineseSafe Leaderboard!
|
4 |
+
On this leaderboard, we share the evaluation results of LLMs obtained by developing a brand new content moderation benchmark for Chinese. 🎉🎉🎉</b>
|
5 |
+
</span>
|
6 |
+
|
7 |
+
# Dataset
|
8 |
+
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
9 |
+
To evaluate the conformity of large language models, we present ChineseSafe, a content moderation benchmark for Chinese (Mandarin).
|
10 |
+
In this benchmark, we include 4 common types of safety issues: Crime, Ethic, Mental health, and their Variant/Homophonic words.
|
11 |
+
In particular, the benchmark is constructed as a balanced dataset, containing safe and unsafe data collected from internet resources and public datasets [1,2,3].
|
12 |
+
We hope the evaluation can provide a reference for researchers and engineers to build safe LLMs in Chinese. <br>
|
13 |
+
|
14 |
+
The leaderboard is under construction and maintained by <a href="https://hongxin001.github.io/" target="_blank">Hongxin Wei's</a> research group at SUSTech.
|
15 |
+
We will release the technical report in the near future.
|
16 |
+
Comments, issues, contributions, and collaborations are all welcomed!
|
17 |
+
Email: [email protected]
|
18 |
+
</span>
|
19 |
+
""" # noqa
|
20 |
+
|
21 |
+
METRICS_TEXT = """
|
22 |
+
# Metrics
|
23 |
+
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
24 |
+
We report the results with five metrics: overall accuracy, precision/recall for safe/unsafe content.
|
25 |
+
In particular, the results are shown as <b>metric/std</b> format in the table,
|
26 |
+
where <b>std</b> indicates the standard deviation of the results obtained from different random seeds.
|
27 |
+
</span>
|
28 |
+
""" # noqa
|
29 |
+
|
30 |
+
EVALUTION_TEXT= """
|
31 |
+
# Evaluation
|
32 |
+
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
33 |
+
We evaluate the models using two methods: multiple choice (perplexity) and generation.
|
34 |
+
For perplexity, we select the label which is the lowest perplexity as the predicted results.
|
35 |
+
For generation, we use the content generated by the model to make prediction.
|
36 |
+
The following are the results of the evaluation. 👇👇👇
|
37 |
+
</span> <br><br>
|
38 |
+
|
39 |
+
|
40 |
+
""" # noqa
|
41 |
+
|
42 |
+
REFERENCE_TEXT = """
|
43 |
+
# References
|
44 |
+
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
45 |
+
[1] Sun H, Zhang Z, Deng J, et al. Safety assessment of chinese large language models[J]. arXiv preprint arXiv:2304.10436, 2023. <br>
|
46 |
+
[2] https://github.com/konsheng/Sensitive-lexicon <br>
|
47 |
+
[3] https://www.cluebenchmarks.com/static/pclue.html <br>
|
48 |
+
|
49 |
+
"""
|
50 |
+
|
51 |
+
ACKNOWLEDGEMENTS_TEXT = """
|
52 |
+
# Acknowledgements
|
53 |
+
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
54 |
+
This research is supported by "Data+AI" Data Intelligent Laboratory,
|
55 |
+
a joint lab constructed by Deepexi and Department of Statistics and Data Science at SUSTech.
|
56 |
+
We gratefully acknowledge the contributions of Prof. Bingyi Jing, Prof. Lili Yang,
|
57 |
+
and Asst. Prof. Guanhua Chen for their support throughout this project.
|
58 |
+
"""
|
59 |
+
|
60 |
+
|
61 |
+
CONTACT_TEXT = """
|
62 |
+
# Contact
|
63 |
+
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
64 |
+
The leaderboard is under construction and maintained by <a href="https://hongxin001.github.io/" target="_blank">Hongxin Wei's</a> research group at SUSTech.
|
65 |
+
We will release the technical report in the near future.
|
66 |
+
Comments, issues, contributions, and collaborations are all welcomed!
|
67 |
+
Email: [email protected]
|
68 |
+
"""
|
changelog.md
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CHANGELOG
|
2 |
+
|
3 |
+
### Version Number
|
4 |
+
v1.0.0
|
5 |
+
2024-7-16
|
6 |
+
|
7 |
+
### Changed
|
8 |
+
- [1]feat: upload the first version
|
data/chinese_benchmark_gen.csv
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model Size Accuracy/std Precision_Unsafe/std Recall_Unsafe/std Precision_Safe/std Recall_Safe/std
|
2 |
+
DeepSeek-LLM-67B-Chat >65B 76.76/0.35 73.40/0.37 84.26/0.40 81.34/0.35 69.19/0.64
|
3 |
+
Llama3-ChatQA-1.5-70B >65B 65.29/0.29 66.24/0.50 62.92/0.12 64.43/0.19 67.69/0.63
|
4 |
+
Qwen1.5-72B-Chat >65B 62.91/0.50 73.86/0.84 40.46/0.97 58.75/0.35 85.55/0.62
|
5 |
+
Opt-66B >65B 54.46/0.17 53.22/0.06 76.94/0.24 57.73/0.49 31.77/0.28
|
6 |
+
Yi-1.5-34B-Chat ~30B 60.06/0.43 58.14/0.40 72.51/0.55 63.27/0.56 47.56/0.42
|
7 |
+
Opt-30B ~30B 50.88/0.11 50.76/0.12 72.95/0.16 51.18/0.26 28.62/0.28
|
8 |
+
InternLM2-Chat-20B 10B~20B 70.21/0.55 73.30/0.70 63.79/0.43 67.82/0.45 76.65/0.67
|
9 |
+
Qwen1.5-14B 10B~20B 68.25/0.44 65.87/0.37 76.02/0.72 71.51/0.59 60.44/0.20
|
10 |
+
Baichuan2-13B-Chat 10B~20B 62.86/0.31 64.17/0.33 58.61/0.80 61.75/0.30 67.13/0.56
|
11 |
+
Ziya2-13B-Chat 10B~20B 53.40/0.43 53.33/0.38 56.18/0.41 53.48/0.53 50.62/0.61
|
12 |
+
Opt-13B 10B~20B 50.18/0.26 50.29/0.20 69.97/0.37 49.94/0.47 30.22/0.31
|
13 |
+
Gemma-1.1-7B 5B~10B 71.70/0.26 68.66/0.37 80.11/0.05 76.00/0.09 63.26/0.47
|
14 |
+
DeepSeek-LLM-7B-Chat 5B~10B 71.63/0.17 69.50/0.15 77.33/0.67 74.33/0.41 65.90/0.38
|
15 |
+
GLM-4-9B-Chat 5B~10B 70.96/0.23 82.15/0.55 53.73/0.48 65.50/0.18 88.27/0.41
|
16 |
+
Mistral-7B 5B~10B 70.41/0.41 68.55/0.52 75.67/0.22 72.71/0.26 65.12/0.58
|
17 |
+
Qwen1.5-7B-Chat 5B~10B 70.36/0.39 64.66/0.27 90.09/0.57 83.55/0.82 50.53/0.18
|
18 |
+
Yi-1.5-9B-Chat 5B~10B 62.12/0.38 64.42/0.42 54.53/0.43 60.43/0.36 69.75/0.37
|
19 |
+
Llama3-ChatQA-1.5-8B 5B~10B 61.28/0.40 57.63/0.20 85.84/0.43 72.02/0.95 36.61/0.54
|
20 |
+
Baichuan2-7B 5B~10B 59.43/0.24 72.06/0.66 31.11/0.40 55.95/0.12 87.89/0.20
|
21 |
+
InternLM2-chat-7B 5B~10B 58.79/0.09 62.70/0.19 43.88/0.17 56.68/0.14 73.77/0.13
|
22 |
+
GPT-J-6B 5B~10B 52.65/0.32 52.42/0.32 62.00/0.42 52.99/0.37 43.21/0.92
|
23 |
+
Opt-6.7B 5B~10B 50.00/0.11 50.17/0.17 64.70/0.35 49.69/0.04 35.18/0.44
|
24 |
+
GPT-4o API 73.78/0.30 97.75/0.13 48.66/0.04 65.84/0.55 98.88/0.04
|
25 |
+
GPT-4-Turbo API 71.67/0.17 80.13/0.64 57.59/0.69 66.93/0.44 85.74/0.35
|
26 |
+
Pespective API 69.28/0.32 69.96/0.79 67.49/0.32 68.64/0.32 71.06/0.43
|
27 |
+
GPT-3.5 API 64.70/0.44 76.12/0.55 42.79/0.64 60.24/0.76 86.59/0.32
|
data/chinese_benchmark_per.csv
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model Size Accuracy/std Precision_Unsafe/std Recall_Unsafe/std Precision_Safe/std Recall_Safe/std
|
2 |
+
DeepSeek-LLM-67B-Chat >65B 68.08/0.35 94.80/0.83 38.40/0.43 61.27/0.26 97.88/0.36
|
3 |
+
Qwen1.5-72B-Chat >65B 63.67/0.46 58.27/0.32 96.84/0.13 90.51/0.57 30.34/0.80
|
4 |
+
Opt-66B >65B 59.93/0.41 56.52/0.37 86.87/0.59 71.36/0.78 32.86/0.74
|
5 |
+
Llama3-ChatQA-1.5-70B >65B 40.41/0.29 33.86/0.75 19.84/0.75 43.13/0.25 61.08/0.37
|
6 |
+
Yi-1.5-34B-Chat ~30B 66.02/0.22 80.13/0.55 42.82/0.25 60.86/0.16 89.33/0.41
|
7 |
+
Opt-30B ~30B 53.82/0.03 54.42/0.21 48.32/0.20 53.34/0.11 59.34/0.27
|
8 |
+
Baichuan2-13B-Chat 10B~20B 70.43/0.39 65.81/0.38 85.34/0.63 79.02/0.63 55.46/0.47
|
9 |
+
Qwen1.5-14B 10B~20B 61.29/0.40 57.02/0.32 92.43/0.55 79.80/1.05 30.02/0.47
|
10 |
+
Ziya2-13B-Chat 10B~20B 55.25/0.26 59.24/0.37 34.30/0.11 53.61/0.26 76.29/0.39
|
11 |
+
InternLM2-Chat-20B 10B~20B 53.67/0.16 79.00/0.66 10.30/0.60 51.90/0.11 97.25/0.26
|
12 |
+
Opt-13B 10B~20B 49.31/0.31 37.77/3.57 1.76/0.16 49.59/0.23 97.08/0.29
|
13 |
+
Gemma-1.1-7B 5B~10B 64.32/0.68 59.98/0.58 86.60/0.35 75.70/0.80 41.95/0.93
|
14 |
+
Qwen1.5-7B-Chat 5B~10B 62.48/0.54 59.06/0.48 81.92/0.50 70.28/0.65 42.96/0.81
|
15 |
+
Yi-1.5-9B-Chat 5B~10B 60.35/0.52 79.47/1.37 28.16/0.33 56.22/0.39 92.69/0.59
|
16 |
+
DeepSeek-LLM-7B-Chat 5B~10B 56.79/0.19 84.83/1.23 16.77/0.09 53.70/0.15 96.99/0.27
|
17 |
+
GPT-J-6B 5B~10B 55.98/0.42 80.27/1.42 16.11/0.86 53.26/0.23 96.03/0.20
|
18 |
+
Baichuan2-7B 5B~10B 53.99/0.51 62.89/1.57 19.96/0.88 52.31/0.30 88.18/0.23
|
19 |
+
GLM-4-9B-Chat 5B~10B 50.03/0.15 50.07/0.13 99.31/0.22 44.12/9.01 0.52/0.04
|
20 |
+
InternLM2-Chat-7B 5B~10B 49.49/0.11 42.16/1.58 2.15/0.31 49.68/0.13 97.06/0.25
|
21 |
+
Opt-6.7B 5B~10B 48.54/0.43 49.24/0.31 86.62/1.03 43.40/1.18 10.30/0.55
|
22 |
+
Mistral-7B 5B~10B 42.99/0.06 39.54/0.47 26.01/0.69 44.69/0.11 60.05/0.50
|
23 |
+
Llama3-ChatQA-1.5-8B 5B~10B 42.11/0.29 37.46/0.85 23.20/0.89 44.20/0.09 61.11/0.57
|
data/subclass_gen.csv
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,Size,Discrimination_Accuracy,Discrimination_Precision,Discrimination_Recall,Variant_Accuracy,Variant_Precision,Variant_Recall,Psychology_Accuracy,Psychology_Precision,Psychology_Recall,Politics_Accuracy,Politics_Precision,Politics_Recall,Eroticism_Accuracy,Eroticism_Precision,Eroticism_Recall,Vulgarity_Accuracy,Vulgarity_Precision,Vulgarity_Recall,Property_Accuracy,Property_Precision,Property_Recall,Injury_Accuracy,Injury_Precision,Injury_Recall,Criminality_Accuracy,Criminality_Precision,Criminality_Recall,Ethics_Accuracy,Ethics_Precision,Ethics_Recall
|
2 |
+
DeepSeek-LLM-67B-Chat,>65B,0.7897,0.7454,0.8652,0.8482,0.7832,0.9726,0.6603,0.6751,0.6011,0.8344,0.7978,0.932,0.8367,0.78,0.9497,0.8449,0.769,0.9767,0.7985,0.7493,0.8825,0.6171,0.6366,0.5125,0.8258,0.7583,0.9401,0.7387,0.7276,0.7596
|
3 |
+
Qwen1.5-72B-Chat,>65B,0.5998,0.693,0.3298,0.8005,0.8477,0.7444,0.4697,0.3314,0.0703,0.6671,0.812,0.506,0.7676,0.8369,0.6803,0.7069,0.7895,0.5476,0.5825,0.6666,0.2918,0.4697,0.3186,0.0668,0.7076,0.7867,0.546,0.5283,0.5803,0.1942
|
4 |
+
Opt-66B,>65B,0.4866,0.482,0.682,0.5174,0.5203,0.7258,0.5579,0.5338,0.8237,0.5646,0.5728,0.7868,0.5385,0.535,0.7659,0.5571,0.5309,0.8257,0.5414,0.5199,0.7954,0.5354,0.5181,0.7801,0.5376,0.515,0.7909,0.5079,0.5041,0.7185
|
5 |
+
Llama3-ChatQA-1.5-70B,>65B,0.6682,0.6617,0.6566,0.6859,0.6932,0.6922,0.6079,0.6187,0.5348,0.6548,0.7024,0.6342,0.6861,0.6945,0.6928,0.7029,0.6853,0.7281,0.6211,0.6242,0.5599,0.6105,0.6189,0.5397,0.7134,0.6873,0.7493,0.59,0.6072,0.4996
|
6 |
+
Yi-1.5-34B-Chat,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6708,0.8646,0.7046,0.6528,0.9053,0.7084,0.6383,0.9309,0.5928,0.5672,0.6961,0.4467,0.4308,0.3972,0.6956,0.6281,0.9097,0.5182,0.515,0.5425
|
7 |
+
Opt-30B,~30B,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.5517,0.7422,0.5108,0.5163,0.7304,0.5161,0.5039,0.7618,0.513,0.5009,0.7578,0.4956,0.4908,0.719,0.5119,0.4977,0.7583,0.4958,0.4955,0.7134
|
8 |
+
Baichuan2-13B-Chat,10B~20B,0.6337,0.6402,0.5755,0.7188,0.7164,0.7457,0.5185,0.5189,0.3417,0.7341,0.7487,0.7703,0.7033,0.7091,0.7143,0.6742,0.6712,0.6575,0.5657,0.5728,0.434,0.6151,0.6264,0.5371,0.6515,0.65,0.6089,0.5532,0.5707,0.414
|
9 |
+
Qwen1.5-14B,10B~20B,0.7099,0.6657,0.8141,0.7897,0.7205,0.9615,0.5669,0.5657,0.5226,0.7776,0.7373,0.9181,0.7571,0.7073,0.897,0.7862,0.7044,0.97,0.6421,0.6225,0.6757,0.5014,0.4893,0.3888,0.7563,0.6869,0.9116,0.5499,0.5538,0.4889
|
10 |
+
Ziya2-13B-Chat,10B~20B,0.5403,0.5272,0.5731,0.6597,0.6313,0.8034,0.3259,0.2145,0.1373,0.673,0.6631,0.8101,0.6526,0.6282,0.7886,0.5583,0.5437,0.6097,0.3987,0.3541,0.2823,0.529,0.5194,0.5497,0.5377,0.5208,0.5678,0.4567,0.4484,0.4035
|
11 |
+
InternLM2-Chat-20B,10B~20B,0.6819,0.7156,0.5781,0.7661,0.7819,0.7518,0.5506,0.5823,0.3134,0.8061,0.8182,0.8271,0.807,0.7993,0.832,0.8128,0.7876,0.8453,0.7037,0.7305,0.6224,0.6092,0.6548,0.4308,0.7815,0.7702,0.7821,0.5613,0.6058,0.3396
|
12 |
+
Opt-13B,10B~20B,0.4746,0.4724,0.637,0.5147,0.519,0.7014,0.5146,0.5059,0.7153,0.5333,0.5557,0.7126,0.5261,0.5278,0.7228,0.5187,0.506,0.7257,0.5232,0.5081,0.7367,0.5218,0.5094,0.7314,0.4956,0.4856,0.6828,0.4722,0.4773,0.6264
|
13 |
+
Gemma-1.1-7B,5B~10B,0.7849,0.7205,0.9139,0.8081,0.7454,0.9485,0.6024,0.6084,0.5413,0.7854,0.758,0.8894,0.8017,0.7436,0.9353,0.8215,0.7367,0.9884,0.6669,0.6543,0.673,0.5811,0.5858,0.4976,0.7831,0.7167,0.9127,0.6684,0.6638,0.6754
|
14 |
+
Qwen1.5-7B-Chat,5B~10B,0.6885,0.6347,0.8535,0.7677,0.6891,0.9938,0.6929,0.6404,0.8588,0.7791,0.7151,0.9869,0.7653,0.6889,0.988,0.7485,0.6659,0.9746,0.684,0.6317,0.8443,0.7267,0.6564,0.929,0.7473,0.662,0.9772,0.5545,0.5496,0.5778
|
15 |
+
Yi-1.5-9B-Chat,5B~10B,0.7025,0.6913,0.7058,0.7032,0.7106,0.707,0.4533,0.3925,0.2,0.6546,0.7097,0.6172,0.7209,0.7213,0.7419,0.8197,0.7508,0.9452,0.5595,0.5666,0.4131,0.4342,0.3378,0.1591,0.7626,0.7215,0.8306,0.4057,0.2654,0.1096
|
16 |
+
DeepSeek-LLM-7B-Chat,5B~10B,0.6455,0.6405,0.6242,0.8131,0.749,0.9539,0.6146,0.6202,0.5617,0.7978,0.7642,0.9083,0.7978,0.7439,0.9236,0.7995,0.7291,0.9387,0.691,0.6715,0.7174,0.6343,0.6345,0.6017,0.7582,0.7064,0.8562,0.6311,0.6381,0.5954
|
17 |
+
GPT-J-6B,5B~10B,0.5076,0.4966,0.5752,0.5259,0.5322,0.6057,0.548,0.5343,0.6564,0.5565,0.5828,0.6522,0.5454,0.5487,0.6439,0.5365,0.5223,0.6345,0.527,0.513,0.6156,0.5365,0.5235,0.634,0.5386,0.5195,0.6408,0.4891,0.4884,0.5365
|
18 |
+
Baichuan2-7B,5B~10B,0.619,0.7508,0.3303,0.6409,0.7993,0.3973,0.5355,0.604,0.1652,0.6101,0.8093,0.3705,0.6285,0.7908,0.3739,0.6831,0.8077,0.4616,0.5551,0.6437,0.198,0.5592,0.6601,0.2106,0.683,0.8045,0.4568,0.5144,0.5518,0.1293
|
19 |
+
GLM-4-9B-Chat,5B~10B,0.7691,0.8562,0.6352,0.7669,0.868,0.6424,0.4801,0.3396,0.0518,0.8123,0.8964,0.7414,0.8671,0.8962,0.8388,0.9278,0.8991,0.9611,0.6401,0.7767,0.3713,0.5054,0.4911,0.1002,0.8728,0.8863,0.8468,0.5132,0.5544,0.1254
|
20 |
+
InternLM2-Chat-7B,5B~10B,0.53,0.5321,0.3028,0.6981,0.7292,0.6512,0.5182,0.5207,0.2824,0.6362,0.7192,0.5431,0.6717,0.7137,0.6002,0.6512,0.6763,0.551,0.5731,0.5951,0.3902,0.5205,0.5203,0.2849,0.6413,0.6626,0.5289,0.4783,0.4438,0.2061
|
21 |
+
Opt-6.7B,5B~10B,0.4717,0.4691,0.6091,0.5087,0.5153,0.6691,0.4931,0.4895,0.6491,0.5308,0.5556,0.6899,0.5215,0.5249,0.6922,0.4969,0.4902,0.6595,0.4803,0.4756,0.6266,0.488,0.4842,0.6406,0.4819,0.4741,0.6315,0.4627,0.4684,0.5853
|
22 |
+
Mistral-7B,5B~10B,0.7069,0.6749,0.7706,0.7521,0.7161,0.8533,0.5826,0.5868,0.5167,0.7142,0.7222,0.7711,0.7599,0.7205,0.8679,0.7956,0.7205,0.9509,0.6748,0.6547,0.7042,0.6139,0.6127,0.5802,0.7742,0.7074,0.9103,0.6388,0.6387,0.6313
|
23 |
+
Llama3-ChatQA-1.5-8B,5B~10B,0.6114,0.5657,0.8761,0.6276,0.5904,0.885,0.5978,0.5613,0.844,0.6056,0.6016,0.8128,0.6113,0.5825,0.8521,0.6365,0.5805,0.9258,0.6062,0.5625,0.8663,0.6034,0.5629,0.8569,0.6223,0.5694,0.903,0.5658,0.5447,0.7752
|
data/subclass_per.csv
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,Size,Discrimination_Accuracy,Discrimination_Precision,Discrimination_Recall,Variant_Accuracy,Variant_Precision,Variant_Recall,Psychology_Accuracy,Psychology_Precision,Psychology_Recall,Politics_Accuracy,Politics_Precision,Politics_Recall,Eroticism_Accuracy,Eroticism_Precision,Eroticism_Recall,Vulgarity_Accuracy,Vulgarity_Precision,Vulgarity_Recall,Property_Accuracy,Property_Precision,Property_Recall,Injury_Accuracy,Injury_Precision,Injury_Recall,Criminality_Accuracy,Criminality_Precision,Criminality_Recall,Ethics_Accuracy,Ethics_Precision,Ethics_Recall
|
2 |
+
DeepSeek-LLM-67B-Chat,>65B,0.6948,0.9451,0.3989,0.6447,0.9375,0.3259,0.5122,0.5824,0.033,0.7673,0.9695,0.5903,0.6865,0.9516,0.4092,0.899,0.9725,0.8159,0.66,0.9341,0.326,0.5479,0.8184,0.1017,0.8777,0.9706,0.7709,0.5142,0.6736,0.0456
|
3 |
+
Qwen1.5-72B-Chat,>65B,0.6479,0.581,0.9985,0.6609,0.6019,0.9938,0.6472,0.5837,0.9906,0.5928,0.5895,0.8276,0.6544,0.5996,0.9796,0.6488,0.5823,0.9987,0.6448,0.5792,0.9932,0.6255,0.5712,0.9493,0.6433,0.5763,0.9951,0.6485,0.5872,0.9874
|
4 |
+
Opt-66B,>65B,0.645,0.5831,0.9572,0.3981,0.417,0.4471,0.6667,0.5971,0.9953,0.6232,0.6095,0.8551,0.4854,0.4984,0.6176,0.652,0.5874,0.9698,0.6511,0.5859,0.9706,0.6604,0.5926,0.9853,0.6556,0.586,0.9846,0.655,0.5943,0.9665
|
5 |
+
Llama3-ChatQA-1.5-70B,>65B,0.3666,0.2082,0.1069,0.339,0.169,0.0752,0.3147,0.0148,0.0059,0.2947,0.075,0.0261,0.7758,0.7167,0.9293,0.5528,0.5482,0.4877,0.3396,0.111,0.0507,0.3207,0.0374,0.0156,0.4392,0.3806,0.2524,0.3214,0.0614,0.0253
|
6 |
+
Yi-1.5-34B-Chat,~30B,0.7139,0.8341,0.5176,0.7722,0.8735,0.6482,0.475,0.2581,0.0357,0.7162,0.8717,0.5603,0.6206,0.7912,0.353,0.8816,0.8938,0.8601,0.6412,0.7813,0.3672,0.497,0.4306,0.0769,0.8472,0.8832,0.7889,0.4818,0.3646,0.0576
|
7 |
+
Opt-30B,~30B,0.5831,0.5754,0.5565,0.3952,0.338,0.1915,0.6784,0.6507,0.7506,0.5798,0.6281,0.5559,0.357,0.2405,0.1185,0.406,0.3224,0.1945,0.6203,0.6061,0.633,0.6188,0.6076,0.6293,0.6031,0.5886,0.5976,0.6244,0.6184,0.6415
|
8 |
+
Baichuan2-13B-Chat,10B~20B,0.7346,0.6715,0.8932,0.7703,0.7043,0.9491,0.6303,0.6129,0.6785,0.7435,0.7152,0.8777,0.779,0.7088,0.9649,0.7677,0.6883,0.9601,0.6763,0.6388,0.7738,0.6359,0.6149,0.6904,0.7096,0.6554,0.8436,0.7306,0.6762,0.8788
|
9 |
+
Qwen1.5-14B,10B~20B,0.625,0.5683,0.964,0.6549,0.5977,0.9932,0.5983,0.5571,0.9038,0.6561,0.6193,0.9535,0.6592,0.6005,0.9994,0.6382,0.5759,0.9897,0.5579,0.53,0.8275,0.5009,0.4938,0.7077,0.6256,0.566,0.9705,0.6063,0.5643,0.914
|
10 |
+
Ziya2-13B-Chat,10B~20B,0.6322,0.6632,0.502,0.381,0.0822,0.0212,0.4263,0.2557,0.086,0.4352,0.4474,0.1651,0.612,0.6721,0.4744,0.812,0.7741,0.8691,0.4904,0.4516,0.2102,0.5309,0.5403,0.2964,0.7186,0.7235,0.6777,0.4811,0.4512,0.2021
|
11 |
+
InternLM2-Chat-20B,10B~20B,0.5184,0.5912,0.0441,0.4754,0.0222,0.0006,0.4929,0.0222,0.0006,0.4744,0.7043,0.0573,0.605,0.904,0.256,0.5265,0.6774,0.0625,0.5689,0.8292,0.146,0.5046,0.4073,0.0202,0.7142,0.9352,0.44,0.498,0.4041,0.0196
|
12 |
+
Opt-13B,10B~20B,0.5011,0.0392,0.0015,0.4792,0.0695,0.0018,0.4958,0,0,0.4492,0.237,0.0055,0.4897,0.5438,0.0249,0.4996,0.0333,0.0006,0.5037,0.1931,0.0055,0.5454,0.8065,0.0965,0.5155,0.499,0.0228,0.5016,0.4815,0.0203
|
13 |
+
Gemma-1.1-7B,5B~10B,0.6885,0.6193,0.9389,0.7201,0.6502,0.9795,0.6709,0.6133,0.8985,0.7171,0.6709,0.9421,0.5993,0.5861,0.7426,0.7164,0.634,0.9953,0.6316,0.5872,0.8235,0.5207,0.5098,0.595,0.6874,0.616,0.9415,0.6164,0.5853,0.7856
|
14 |
+
Qwen1.5-7B-Chat,5B~10B,0.6415,0.5933,0.8439,0.7295,0.6542,0.9987,0.5495,0.5352,0.6535,0.7415,0.6808,0.9875,0.7286,0.6545,0.9955,0.7167,0.6339,0.9966,0.6122,0.5749,0.784,0.4866,0.4788,0.5265,0.6887,0.6165,0.9449,0.4276,0.4219,0.4072
|
15 |
+
Yi-1.5-9B-Chat,5B~10B,0.7089,0.8612,0.4825,0.5418,0.7129,0.1741,0.4846,0.2932,0.0308,0.5376,0.7743,0.2115,0.6185,0.8236,0.3254,0.818,0.9011,0.7057,0.5819,0.7416,0.2207,0.4893,0.3279,0.0365,0.7959,0.8937,0.6572,0.477,0.2414,0.0233
|
16 |
+
DeepSeek-LLM-7B-Chat,5B~10B,0.5078,0.4247,0.0246,0.5288,0.7841,0.1076,0.4923,0.0435,0.0019,0.5924,0.9137,0.2765,0.6125,0.9021,0.2737,0.6802,0.9215,0.3786,0.542,0.7419,0.0938,0.503,0.3766,0.0194,0.7217,0.9323,0.4588,0.4987,0.4142,0.0238
|
17 |
+
GPT-J-6B,5B~10B,0.4991,0.3302,0.0256,0.4649,0,0,0.4979,0.3838,0.0323,0.4836,0.6888,0.0916,0.9195,0.9496,0.8899,0.4914,0.1924,0.0123,0.5186,0.5541,0.0647,0.5155,0.5553,0.0641,0.5632,0.7398,0.1505,0.5932,0.8229,0.2323
|
18 |
+
Baichuan2-7B,5B~10B,0.4946,0.3641,0.0834,0.475,0.4213,0.0801,0.4753,0.2928,0.0516,0.442,0.4168,0.0681,0.8239,0.8619,0.7567,0.4889,0.3678,0.0737,0.4868,0.339,0.0659,0.478,0.2945,0.0538,0.6055,0.6911,0.3029,0.4752,0.3189,0.0577
|
19 |
+
GLM-4-9B-Chat,5B~10B,0.4974,0.4928,0.9986,0.5202,0.5158,0.9994,0.4984,0.4957,0.9914,0.5521,0.5483,0.9989,0.5179,0.5152,0.9918,0.4992,0.4944,1,0.4923,0.4899,0.9892,0.478,0.484,0.9541,0.4924,0.4881,0.9958,0.5039,0.5006,0.9928
|
20 |
+
InternLM2-Chat-7B,5B~10B,0.4988,0,0,0.4767,0,0,0.4943,0,0,0.4453,0.0513,0.0011,0.5829,0.8965,0.21,0.4977,0,0,0.4997,0.0278,0.0007,0.4964,0,0,0.5026,0,0,0.4901,0.0278,0.0006
|
21 |
+
Opt-6.7B,5B~10B,0.5189,0.5038,0.9645,0.3756,0.4266,0.6456,0.5227,0.5083,0.9638,0.549,0.5504,0.9314,0.2606,0.3276,0.4205,0.4833,0.4847,0.8892,0.5274,0.508,0.9831,0.5244,0.508,0.971,0.5105,0.4973,0.9551,0.5322,0.5159,0.9757
|
22 |
+
Mistral-7B,5B~10B,0.4091,0.3399,0.2241,0.3013,0.0672,0.0286,0.3093,0.0548,0.0246,0.3554,0.3176,0.1618,0.4671,0.473,0.3538,0.62,0.6022,0.655,0.432,0.3832,0.2701,0.3362,0.1517,0.0771,0.6338,0.6081,0.6844,0.3814,0.2943,0.1744
|
23 |
+
Llama3-ChatQA-1.5-8B,5B~10B,0.387,0.2816,0.1665,0.3232,0.1355,0.0603,0.3054,0.011,0.0045,0.292,0.0948,0.0354,0.7946,0.7193,0.9821,0.5375,0.5306,0.4746,0.3702,0.2367,0.1312,0.318,0.0621,0.0276,0.4823,0.4562,0.3594,0.3398,0.1632,0.0793
|
requirements.txt
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.2.1
|
2 |
+
altair==5.1.2
|
3 |
+
annotated-types==0.6.0
|
4 |
+
anyio==3.7.1
|
5 |
+
attrs==23.1.0
|
6 |
+
certifi==2023.11.17
|
7 |
+
charset-normalizer==3.3.2
|
8 |
+
click==8.1.7
|
9 |
+
colorama==0.4.6
|
10 |
+
contourpy==1.2.0
|
11 |
+
cycler==0.12.1
|
12 |
+
exceptiongroup==1.1.3
|
13 |
+
fastapi==0.104.1
|
14 |
+
ffmpy==0.3.1
|
15 |
+
filelock==3.13.1
|
16 |
+
fonttools==4.45.0
|
17 |
+
fsspec==2023.10.0
|
18 |
+
gradio==4.4.1
|
19 |
+
gradio_client==0.7.0
|
20 |
+
h11==0.14.0
|
21 |
+
httpcore==1.0.2
|
22 |
+
httpx==0.25.1
|
23 |
+
huggingface-hub==0.19.4
|
24 |
+
idna==3.4
|
25 |
+
importlib-resources==6.1.1
|
26 |
+
Jinja2==3.1.2
|
27 |
+
jsonschema==4.20.0
|
28 |
+
jsonschema-specifications==2023.11.1
|
29 |
+
kiwisolver==1.4.5
|
30 |
+
markdown-it-py==3.0.0
|
31 |
+
MarkupSafe==2.1.3
|
32 |
+
matplotlib==3.8.2
|
33 |
+
mdurl==0.1.2
|
34 |
+
numpy==1.26.2
|
35 |
+
orjson==3.9.10
|
36 |
+
packaging==23.2
|
37 |
+
pandas==2.1.3
|
38 |
+
Pillow==10.1.0
|
39 |
+
pydantic==2.5.1
|
40 |
+
pydantic_core==2.14.3
|
41 |
+
pydub==0.25.1
|
42 |
+
Pygments==2.17.1
|
43 |
+
pyparsing==3.1.1
|
44 |
+
python-dateutil==2.8.2
|
45 |
+
python-multipart==0.0.6
|
46 |
+
pytz==2023.3.post1
|
47 |
+
PyYAML==6.0.1
|
48 |
+
referencing==0.31.0
|
49 |
+
requests==2.31.0
|
50 |
+
rich==13.7.0
|
51 |
+
rpds-py==0.13.1
|
52 |
+
semantic-version==2.10.0
|
53 |
+
shellingham==1.5.4
|
54 |
+
six==1.16.0
|
55 |
+
sniffio==1.3.0
|
56 |
+
starlette==0.27.0
|
57 |
+
tomlkit==0.12.0
|
58 |
+
toolz==0.12.0
|
59 |
+
tqdm==4.66.1
|
60 |
+
typer==0.9.0
|
61 |
+
typing_extensions==4.8.0
|
62 |
+
tzdata==2023.3
|
63 |
+
urllib3==2.1.0
|
64 |
+
uvicorn==0.24.0.post1
|
65 |
+
websockets==11.0.3
|