Adding submit model instructions and route
Browse files
- .gitignore +2 -1
- app.py +74 -1
- static/figures/cardinal.svg +629 -538
- static/figures/ordinal.svg +627 -536
- static/leaderboard.csv +19 -17
- templates/about.html +1 -1
- templates/failed_submission.html +221 -0
- templates/index.html +19 -2
- templates/model_detail.html +1 -1
- templates/model_submitted.html +223 -0
- templates/new_model.html +312 -0
.gitignore
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
.idea/
|
2 |
__pycache__/*
|
3 |
-
copy_data.sh
|
|
|
|
1 |
.idea/
|
2 |
__pycache__/*
|
3 |
+
copy_data.sh
|
4 |
+
uploads/*
|
app.py
CHANGED
@@ -1,9 +1,21 @@
|
|
1 |
-
|
2 |
import pandas as pd
|
3 |
import utils
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
app = Flask(__name__)
|
|
|
|
|
|
|
6 |
|
|
|
|
|
|
|
7 |
|
8 |
@app.route('/')
|
9 |
def index():
|
@@ -55,5 +67,66 @@ def model_detail(model_name):
|
|
55 |
def about():
|
56 |
return render_template('about.html')
|
57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
if __name__ == '__main__':
|
59 |
app.run(host='0.0.0.0', port=7860, debug=True)
|
|
|
1 |
+
import os
|
2 |
import pandas as pd
|
3 |
import utils
|
4 |
+
import base64
|
5 |
+
import shutil
|
6 |
+
import zipfile
|
7 |
+
from flask import Flask, render_template, request, redirect, url_for
|
8 |
+
from postmarker.core import PostmarkClient
|
9 |
+
from werkzeug.utils import secure_filename
|
10 |
|
11 |
app = Flask(__name__)
|
12 |
+
app.config['UPLOAD_FOLDER'] = 'uploads' # Directory where files will be stored
|
13 |
+
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
14 |
+
app.config['ALLOWED_EXTENSIONS'] = {'zip'}
|
15 |
|
16 |
+
def allowed_file(filename):
|
17 |
+
return '.' in filename and \
|
18 |
+
filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS']
|
19 |
|
20 |
@app.route('/')
|
21 |
def index():
|
|
|
67 |
def about():
|
68 |
return render_template('about.html')
|
69 |
|
70 |
+
@app.route('/new_model')
|
71 |
+
def new_model():
|
72 |
+
return render_template('new_model.html')
|
73 |
+
|
74 |
+
@app.route('/model_submitted')
|
75 |
+
def model_submitted():
|
76 |
+
return render_template('model_submitted.html')
|
77 |
+
|
78 |
+
@app.route('/failed_submission')
|
79 |
+
def failed_submission():
|
80 |
+
return render_template('failed_submission.html')
|
81 |
+
|
82 |
+
|
83 |
+
@app.route('/submit_model', methods=['POST'])
|
84 |
+
def submit_model():
|
85 |
+
model_name = request.form['model_name']
|
86 |
+
pull_request_link = request.form['pull_request_link']
|
87 |
+
email = request.form['email']
|
88 |
+
description = request.form['description']
|
89 |
+
|
90 |
+
# Handle ZIP file upload
|
91 |
+
if 'model_files' not in request.files:
|
92 |
+
return redirect(url_for('failed_submission'))
|
93 |
+
|
94 |
+
file = request.files['model_files']
|
95 |
+
|
96 |
+
if file and allowed_file(file.filename):
|
97 |
+
filename = secure_filename(file.filename)
|
98 |
+
file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
|
99 |
+
file.save(file_path)
|
100 |
+
|
101 |
+
# Read the file content and encode it in base64
|
102 |
+
with open(file_path, 'rb') as f:
|
103 |
+
file_content = base64.b64encode(f.read()).decode('ascii')
|
104 |
+
|
105 |
+
# Set up Postmark email client
|
106 |
+
postmark = PostmarkClient(server_token=os.getenv('POSTMARK_SERVER_API'))
|
107 |
+
|
108 |
+
# Send the email with the attachment
|
109 |
+
postmark.emails.send(
|
110 |
+
From='[email protected]',
|
111 |
+
To='[email protected]',
|
112 |
+
Subject=f'Stick to Your Role! Model Submission: {model_name}',
|
113 |
+
HtmlBody=f"""
|
114 |
+
<p><strong>Model Name:</strong> {model_name}</p>
|
115 |
+
<p><strong>Pull Request Link:</strong> {pull_request_link}</p>
|
116 |
+
<p><strong>Email:</strong> {email}</p>
|
117 |
+
<p><strong>Description:</strong> {description}</p>
|
118 |
+
""",
|
119 |
+
Attachments=[{
|
120 |
+
'Name': filename,
|
121 |
+
'Content': file_content,
|
122 |
+
'ContentType': 'application/zip'
|
123 |
+
}]
|
124 |
+
)
|
125 |
+
else:
|
126 |
+
return redirect(url_for('failed_submission'))
|
127 |
+
|
128 |
+
return redirect(url_for('model_submitted'))
|
129 |
+
|
130 |
+
|
131 |
if __name__ == '__main__':
|
132 |
app.run(host='0.0.0.0', port=7860, debug=True)
|
static/figures/cardinal.svg
CHANGED
static/figures/ordinal.svg
CHANGED
static/leaderboard.csv
CHANGED
@@ -1,18 +1,20 @@
|
|
1 |
Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Stress,Separability,CFI,SRMR,RMSEA
|
2 |
-
phi-3-mini-128k-instruct,0.
|
3 |
-
phi-3-medium-128k-instruct,0.
|
4 |
-
Mistral-7B-Instruct-v0.1,0.
|
5 |
-
Mistral-7B-Instruct-v0.2,0.
|
6 |
-
Mistral-7B-Instruct-v0.3,0.
|
7 |
-
Mixtral-8x7B-Instruct-v0.1,0.
|
8 |
-
Mixtral-8x22B-Instruct-v0.1,0.
|
9 |
-
command_r_plus,0.
|
10 |
-
llama_3_8b_instruct,0.
|
11 |
-
llama_3_70b_instruct,0.
|
12 |
-
llama_3.1_8b_instruct,0.
|
13 |
-
llama_3.1_70b_instruct,0.
|
14 |
-
Qwen2-7B-Instruct,0.
|
15 |
-
Qwen2-72B-Instruct,0.
|
16 |
-
gpt-3.5-turbo-0125,0.
|
17 |
-
gpt-4o-0513,0.
|
18 |
-
|
|
|
|
|
|
1 |
Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Stress,Separability,CFI,SRMR,RMSEA
|
2 |
+
phi-3-mini-128k-instruct,0.32853223593964337,0.4571976280473622,0.039299993295009855,0.281800547806919,0.963768115942029,0.7509527777777777,0.25489166666666674,0.22045000000000003
|
3 |
+
phi-3-medium-128k-instruct,0.34224965706447186,0.46871557360419164,0.09692037989916814,0.2651981204439735,0.9975845410628019,0.6727694444444445,0.2984500000000001,0.2759472222222221
|
4 |
+
Mistral-7B-Instruct-v0.1,0.19958847736625512,0.38323622857524176,0.027216280472015988,0.2829498135031582,0.995169082125604,0.500288888888889,0.45314444444444446,0.4191027777777777
|
5 |
+
Mistral-7B-Instruct-v0.2,0.38545953360768176,0.4692343788574553,0.14417876497818388,0.265188983528973,1.0,0.5787944444444445,0.35010277777777776,0.3171083333333333
|
6 |
+
Mistral-7B-Instruct-v0.3,0.2702331961591221,0.4168826678339619,0.07960539866974455,0.2742399030139009,0.9975845410628019,0.5231444444444444,0.4214972222222223,0.3914694444444443
|
7 |
+
Mixtral-8x7B-Instruct-v0.1,0.4746227709190672,0.5307045793457128,0.21473356319081474,0.2624402608740656,1.0,0.6766166666666665,0.25611666666666666,0.24065277777777772
|
8 |
+
Mixtral-8x22B-Instruct-v0.1,0.2791495198902606,0.41811429894732177,0.1414001940345544,0.2548838005881672,0.9654589371980676,0.45902777777777776,0.4849916666666666,0.4871833333333333
|
9 |
+
command_r_plus,0.5761316872427983,0.6136142726835458,0.3429686514651868,0.23811982320641845,0.963768115942029,0.7772111111111112,0.17755277777777778,0.17465277777777777
|
10 |
+
llama_3_8b_instruct,0.49108367626886146,0.5571604188191388,0.24527785038654715,0.245806400289881,0.961352657004831,0.7348277777777779,0.20952222222222228,0.20751944444444437
|
11 |
+
llama_3_70b_instruct,0.718792866941015,0.7573878472446817,0.607020698814379,0.18525883672204868,1.0,0.8298166666666668,0.10965277777777771,0.14649722222222217
|
12 |
+
llama_3.1_8b_instruct,0.5521262002743484,0.6056589663453942,0.4295080949846363,0.22060228669473025,0.9710144927536233,0.6379333333333334,0.3225500000000001,0.3328972222222223
|
13 |
+
llama_3.1_70b_instruct,0.7517146776406035,0.78874072958529,0.691365862744007,0.1709718847084183,0.9944444444444444,0.8203805555555554,0.14023055555555552,0.17041944444444446
|
14 |
+
Qwen2-7B-Instruct,0.4465020576131687,0.5256131964101429,0.25108519506513916,0.25776537005719313,0.9855072463768116,0.6248583333333334,0.32358611111111113,0.3028361111111111
|
15 |
+
Qwen2-72B-Instruct,0.5802469135802469,0.6858608495773215,0.6465993243020925,0.20297742879025626,0.9833333333333333,0.5559722222222221,0.3575638888888889,0.39241388888888884
|
16 |
+
gpt-3.5-turbo-0125,0.22565157750342937,0.4028828123262879,0.08240359836763214,0.28728574920060357,1.0,0.4998916666666666,0.47583055555555553,0.4404444444444445
|
17 |
+
gpt-4o-0513,0.705761316872428,0.707844597747704,0.5122163952167618,0.19201420113771173,1.0,0.7998694444444445,0.14606111111111109,0.1400583333333334
|
18 |
+
gpt-4o-mini-2024-07-18,0.37517146776406035,0.4740062039155729,0.13575309046266867,0.2707065266105181,1.0,0.6141777777777777,0.32648055555555555,0.29394722222222214
|
19 |
+
Mistral-Large-Instruct-2407,0.7613168724279836,0.8046038845509005,0.7644582301049158,0.16944638941325085,0.994806763285024,0.7604888888888888,0.18767499999999993,0.21457222222222228
|
20 |
+
dummy,0.14609053497942384,0.3585809973377891,-0.009004148398032956,0.2928877637010999,1.0,0.5076361111111111,0.4973388888888889,0.4541638888888889
|
templates/about.html
CHANGED
@@ -349,7 +349,7 @@ their expression of that value).
|
|
349 |
</p>
|
350 |
</div>
|
351 |
<div class="back-button">
|
352 |
-
<a href="{{ url_for('index') }}" class="custom-button mt-3">
|
353 |
</div>
|
354 |
<div class="citation-section">
|
355 |
<p>If you found this project useful, please cite our related paper:</p>
|
|
|
349 |
</p>
|
350 |
</div>
|
351 |
<div class="back-button">
|
352 |
+
<a href="{{ url_for('index') }}" class="custom-button mt-3">Main page</a>
|
353 |
</div>
|
354 |
<div class="citation-section">
|
355 |
<p>If you found this project useful, please cite our related paper:</p>
|
templates/failed_submission.html
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>Stick To Your Role! About</title>
|
7 |
+
<!-- Include Bootstrap CSS for styling -->
|
8 |
+
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/css/bootstrap.min.css">
|
9 |
+
<!-- Include DataTables CSS -->
|
10 |
+
<link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/dataTables.bootstrap5.min.css">
|
11 |
+
<!-- Custom CSS for additional styling -->
|
12 |
+
<style>
|
13 |
+
body {
|
14 |
+
background-color: #f8f9fa;
|
15 |
+
font-family: 'Arial', sans-serif;
|
16 |
+
}
|
17 |
+
.container {
|
18 |
+
max-width: 1200px; /* Limit the width of the container */
|
19 |
+
margin: auto; /* Center the container */
|
20 |
+
padding: 20px; /* Add some padding */
|
21 |
+
background: #fff;
|
22 |
+
border-radius: 8px;
|
23 |
+
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
|
24 |
+
}
|
25 |
+
h1 {
|
26 |
+
color: #333;
|
27 |
+
text-align: center;
|
28 |
+
}
|
29 |
+
h2 {
|
30 |
+
color: #333;
|
31 |
+
margin-top: 30px;
|
32 |
+
text-align: center;
|
33 |
+
}
|
34 |
+
.table-responsive {
|
35 |
+
margin-top: 20px;
|
36 |
+
}
|
37 |
+
table {
|
38 |
+
border-collapse: separate;
|
39 |
+
border-spacing: 0;
|
40 |
+
font-size: 14px; /* Reduce the font size */
|
41 |
+
width: 100%;
|
42 |
+
border: none; /* Remove any default border */
|
43 |
+
}
|
44 |
+
table thead th {
|
45 |
+
background-color: #610b5d;
|
46 |
+
color: white;
|
47 |
+
border: 1px solid #dee2e6;
|
48 |
+
text-align: left;
|
49 |
+
}
|
50 |
+
table tbody tr {
|
51 |
+
background-color: #fff;
|
52 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
53 |
+
}
|
54 |
+
table tbody tr:hover {
|
55 |
+
background-color: #f1f1f1;
|
56 |
+
}
|
57 |
+
table td, table th {
|
58 |
+
padding: 10px; /* Reduce padding */
|
59 |
+
border: 1px solid #dee2e6;
|
60 |
+
}
|
61 |
+
table th:first-child {
|
62 |
+
border-top-left-radius: 10px;
|
63 |
+
}
|
64 |
+
table th:last-child {
|
65 |
+
border-top-right-radius: 10px;
|
66 |
+
}
|
67 |
+
.section{
|
68 |
+
padding-top: 19px;
|
69 |
+
text-align: left;
|
70 |
+
}
|
71 |
+
|
72 |
+
.section p {
|
73 |
+
padding-left: 150px;
|
74 |
+
padding-right: 150px;
|
75 |
+
text-indent: 2em;
|
76 |
+
margin: auto;
|
77 |
+
margin-bottom: 10px;
|
78 |
+
text-align: left;
|
79 |
+
}
|
80 |
+
|
81 |
+
.section ol, ul {
|
82 |
+
padding-left: 150px;
|
83 |
+
padding-right: 150px;
|
84 |
+
margin: auto;
|
85 |
+
margin-bottom: 20px;
|
86 |
+
margin-left: 50px;
|
87 |
+
text-align: left;
|
88 |
+
margin-top: 0px;
|
89 |
+
}
|
90 |
+
|
91 |
+
.citation-section {
|
92 |
+
width: 100%;
|
93 |
+
margin-top: 50px;
|
94 |
+
text-align: center;
|
95 |
+
}
|
96 |
+
.citation-box {
|
97 |
+
background-color: #f8f9fa;
|
98 |
+
border: 1px solid #dee2e6;
|
99 |
+
border-radius: 8px;
|
100 |
+
padding: 10px;
|
101 |
+
margin-top: 5px;
|
102 |
+
font-size: 15px;
|
103 |
+
text-align: left;
|
104 |
+
font-family: 'Courier New', Courier, monospace;
|
105 |
+
white-space: pre;
|
106 |
+
}
|
107 |
+
|
108 |
+
.image-container-structure {
|
109 |
+
display: flex;
|
110 |
+
justify-content: center;
|
111 |
+
gap: 10px;
|
112 |
+
margin-bottom: 40px;
|
113 |
+
max-width: 70%; /* Adjust the width as needed */
|
114 |
+
margin: auto;
|
115 |
+
}
|
116 |
+
|
117 |
+
.image-container-structure a {
|
118 |
+
flex: 1;
|
119 |
+
}
|
120 |
+
|
121 |
+
.image-container-structure img {
|
122 |
+
max-width: 100%;
|
123 |
+
height: auto;
|
124 |
+
display: block;
|
125 |
+
margin: auto;
|
126 |
+
}
|
127 |
+
|
128 |
+
.image-container {
|
129 |
+
width: 100%;
|
130 |
+
margin-bottom: 40px;
|
131 |
+
}
|
132 |
+
.image-container #admin-questionnaire {
|
133 |
+
width: 50%;
|
134 |
+
height: auto;
|
135 |
+
display: block;
|
136 |
+
margin: auto;
|
137 |
+
}
|
138 |
+
.image-container #ro-image {
|
139 |
+
width: 70%;
|
140 |
+
height: auto;
|
141 |
+
display: block;
|
142 |
+
margin: auto;
|
143 |
+
}
|
144 |
+
|
145 |
+
.section-title {
|
146 |
+
font-size: 24px;
|
147 |
+
font-weight: bold;
|
148 |
+
text-align: center;
|
149 |
+
margin-bottom: 40px;
|
150 |
+
padding: 20px; /* Add padding for more margin around text */
|
151 |
+
background-color: #610b5d;
|
152 |
+
color: #fff; /* Ensure text is readable on dark background */
|
153 |
+
border-radius: 15px; /* Rounded edges */
|
154 |
+
}
|
155 |
+
.back-button {
|
156 |
+
text-align: center;
|
157 |
+
margin-top: 50px;
|
158 |
+
}
|
159 |
+
.custom-button {
|
160 |
+
background-color: #610b5d;
|
161 |
+
color: #fff; /* Set white text color */
|
162 |
+
border-radius: 15px; /* Rounded edges */
|
163 |
+
padding: 10px 20px; /* Padding for the button */
|
164 |
+
font-size: 18px; /* Increase font size */
|
165 |
+
text-decoration: none; /* Remove underline */
|
166 |
+
}
|
167 |
+
.custom-button:hover {
|
168 |
+
background-color: #812b7d;
|
169 |
+
color: #fff;
|
170 |
+
}
|
171 |
+
</style>
|
172 |
+
</head>
|
173 |
+
<body>
|
174 |
+
<div class="container">
|
175 |
+
<h1 class="mt-5">Stick To Your Role! Leaderboard</h1>
|
176 |
+
<div class="table-responsive">
|
177 |
+
<!-- Render the table HTML here -->
|
178 |
+
{{ table_html|safe }}
|
179 |
+
</div>
|
180 |
+
<div class="section">
|
181 |
+
<div class="section-title">There was an issue with your submission.</div>
|
182 |
+
<p>
|
183 |
+
Try again or contact us at <a href= "mailto: [email protected]">[email protected]</a>.
|
184 |
+
</p>
|
185 |
+
<div class="back-button">
|
186 |
+
<a href="{{ url_for('index') }}" class="custom-button mt-3">Main page</a>
|
187 |
+
</div>
|
188 |
+
</div>
|
189 |
+
</div>
|
190 |
+
</div>
|
191 |
+
|
192 |
+
<!-- Include jQuery -->
|
193 |
+
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
|
194 |
+
<!-- Include Bootstrap JS -->
|
195 |
+
<script src="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/js/bootstrap.bundle.min.js"></script>
|
196 |
+
<!-- Include DataTables JS -->
|
197 |
+
<script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
|
198 |
+
<script src="https://cdn.datatables.net/1.11.5/js/dataTables.bootstrap5.min.js"></script>
|
199 |
+
<!-- Initialize DataTables -->
|
200 |
+
<script>
|
201 |
+
$(document).ready(function() {
|
202 |
+
const table = $('table').DataTable({
|
203 |
+
"paging": false,
|
204 |
+
"info": false,
|
205 |
+
"columnDefs": [
|
206 |
+
{ "orderable": false, "targets": 0 },
|
207 |
+
{ "searchable": false, "targets": 0 }
|
208 |
+
],
|
209 |
+
"order": [[ 2, 'desc' ]],
|
210 |
+
"drawCallback": function(settings) {
|
211 |
+
var api = this.api();
|
212 |
+
api.column(0, {order:'applied'}).nodes().each(function(cell, i) {
|
213 |
+
cell.innerHTML = i + 1;
|
214 |
+
});
|
215 |
+
}
|
216 |
+
});
|
217 |
+
});
|
218 |
+
|
219 |
+
</script>
|
220 |
+
</body>
|
221 |
+
</html>
|
templates/index.html
CHANGED
@@ -41,6 +41,14 @@
|
|
41 |
text-align: left;
|
42 |
}
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
.table-responsive {
|
45 |
margin-top: 20px;
|
46 |
max-width: 1000px; /* Adjust the width as needed */
|
@@ -195,8 +203,8 @@
|
|
195 |
As proposed in our <a href="https://arxiv.org/abs/2402.14846">paper</a>,
|
196 |
unwanted context-dependence should be seen as a <b>property of LLMs</b> - a dimension of LLM comparison (alongside others such as model size, speed, or expressed knowledge).
|
197 |
This leaderboard aims to provide such a comparison and extends our paper with a more focused and elaborate experimental setup.
|
198 |
-
Standard benchmarks present MANY questions from the SAME MINIMAL contexts (e.g. multiple choice questions),
|
199 |
-
we present SAME questions from MANY different contexts
|
200 |
</p>
|
201 |
<div class="table-responsive main-table">
|
202 |
<!-- Render the table HTML here -->
|
@@ -238,6 +246,9 @@
|
|
238 |
<div class="about-button">
|
239 |
<a href="{{ url_for('about') }}" class="custom-button mt-3">Learn More About This Project</a>
|
240 |
</div>
|
|
|
|
|
|
|
241 |
<div class="citation-section">
|
242 |
<p>
|
243 |
If you found this project useful, please cite our related paper,
|
@@ -253,6 +264,12 @@
|
|
253 |
}
|
254 |
</div>
|
255 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
</div>
|
257 |
|
258 |
<!-- Include jQuery -->
|
|
|
41 |
text-align: left;
|
42 |
}
|
43 |
|
44 |
+
ul {
|
45 |
+
margin: auto; /* Center the list */
|
46 |
+
margin-top: 20px;
|
47 |
+
margin-bottom: 10px;
|
48 |
+
max-width: 1000px; /* Adjust the width as needed */
|
49 |
+
text-align: left;
|
50 |
+
}
|
51 |
+
|
52 |
.table-responsive {
|
53 |
margin-top: 20px;
|
54 |
max-width: 1000px; /* Adjust the width as needed */
|
|
|
203 |
As proposed in our <a href="https://arxiv.org/abs/2402.14846">paper</a>,
|
204 |
unwanted context-dependence should be seen as a <b>property of LLMs</b> - a dimension of LLM comparison (alongside others such as model size, speed, or expressed knowledge).
|
205 |
This leaderboard aims to provide such a comparison and extends our paper with a more focused and elaborate experimental setup.
|
206 |
+
Standard benchmarks present <b>MANY</b> questions from the <b>SAME MINIMAL contexts</b> (e.g. multiple choice questions),
|
207 |
+
we present <b>SAME</b> questions from <b>MANY different contexts</b>.
|
208 |
</p>
|
209 |
<div class="table-responsive main-table">
|
210 |
<!-- Render the table HTML here -->
|
|
|
246 |
<div class="about-button">
|
247 |
<a href="{{ url_for('about') }}" class="custom-button mt-3">Learn More About This Project</a>
|
248 |
</div>
|
249 |
+
<div class="about-button">
|
250 |
+
<a href="{{ url_for('new_model') }}" class="custom-button mt-3">Submit a model</a>
|
251 |
+
</div>
|
252 |
<div class="citation-section">
|
253 |
<p>
|
254 |
If you found this project useful, please cite our related paper,
|
|
|
264 |
}
|
265 |
</div>
|
266 |
</div>
|
267 |
+
<ul>
|
268 |
+
<li>Contact: <a href="mailto: [email protected]">[email protected]</a></li>
|
269 |
+
<li>See the <a href="https://sites.google.com/view/llmvaluestability">Project website</a></li>
|
270 |
+
<li>See the Flowers team <a href="http://developmentalsystems.org">blog</a> and <a href="https://flowers.inria.fr/">website</a></li>
|
271 |
+
<li>See Grgur's website and other projects: <a href="https://grgkovac.github.io/">https://grgkovac.github.io/</a></li>
|
272 |
+
</ul>
|
273 |
</div>
|
274 |
|
275 |
<!-- Include jQuery -->
|
templates/model_detail.html
CHANGED
@@ -140,7 +140,7 @@
|
|
140 |
</div>
|
141 |
</div>
|
142 |
<div class="back-button">
|
143 |
-
<a href="{{ url_for('index') }}" class="custom-button mt-3">
|
144 |
</div>
|
145 |
</div>
|
146 |
|
|
|
140 |
</div>
|
141 |
</div>
|
142 |
<div class="back-button">
|
143 |
+
<a href="{{ url_for('index') }}" class="custom-button mt-3">Main page</a>
|
144 |
</div>
|
145 |
</div>
|
146 |
|
templates/model_submitted.html
ADDED
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>Stick To Your Role! About</title>
|
7 |
+
<!-- Include Bootstrap CSS for styling -->
|
8 |
+
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/css/bootstrap.min.css">
|
9 |
+
<!-- Include DataTables CSS -->
|
10 |
+
<link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/dataTables.bootstrap5.min.css">
|
11 |
+
<!-- Custom CSS for additional styling -->
|
12 |
+
<style>
|
13 |
+
body {
|
14 |
+
background-color: #f8f9fa;
|
15 |
+
font-family: 'Arial', sans-serif;
|
16 |
+
}
|
17 |
+
.container {
|
18 |
+
max-width: 1200px; /* Limit the width of the container */
|
19 |
+
margin: auto; /* Center the container */
|
20 |
+
padding: 20px; /* Add some padding */
|
21 |
+
background: #fff;
|
22 |
+
border-radius: 8px;
|
23 |
+
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
|
24 |
+
}
|
25 |
+
h1 {
|
26 |
+
color: #333;
|
27 |
+
text-align: center;
|
28 |
+
}
|
29 |
+
h2 {
|
30 |
+
color: #333;
|
31 |
+
margin-top: 30px;
|
32 |
+
text-align: center;
|
33 |
+
}
|
34 |
+
.table-responsive {
|
35 |
+
margin-top: 20px;
|
36 |
+
}
|
37 |
+
table {
|
38 |
+
border-collapse: separate;
|
39 |
+
border-spacing: 0;
|
40 |
+
font-size: 14px; /* Reduce the font size */
|
41 |
+
width: 100%;
|
42 |
+
border: none; /* Remove any default border */
|
43 |
+
}
|
44 |
+
table thead th {
|
45 |
+
background-color: #610b5d;
|
46 |
+
color: white;
|
47 |
+
border: 1px solid #dee2e6;
|
48 |
+
text-align: left;
|
49 |
+
}
|
50 |
+
table tbody tr {
|
51 |
+
background-color: #fff;
|
52 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
53 |
+
}
|
54 |
+
table tbody tr:hover {
|
55 |
+
background-color: #f1f1f1;
|
56 |
+
}
|
57 |
+
table td, table th {
|
58 |
+
padding: 10px; /* Reduce padding */
|
59 |
+
border: 1px solid #dee2e6;
|
60 |
+
}
|
61 |
+
table th:first-child {
|
62 |
+
border-top-left-radius: 10px;
|
63 |
+
}
|
64 |
+
table th:last-child {
|
65 |
+
border-top-right-radius: 10px;
|
66 |
+
}
|
67 |
+
.section{
|
68 |
+
padding-top: 19px;
|
69 |
+
text-align: left;
|
70 |
+
}
|
71 |
+
|
72 |
+
.section p {
|
73 |
+
padding-left: 150px;
|
74 |
+
padding-right: 150px;
|
75 |
+
text-indent: 2em;
|
76 |
+
margin: auto;
|
77 |
+
margin-bottom: 10px;
|
78 |
+
text-align: left;
|
79 |
+
}
|
80 |
+
|
81 |
+
.section ol, ul {
|
82 |
+
padding-left: 150px;
|
83 |
+
padding-right: 150px;
|
84 |
+
margin: auto;
|
85 |
+
margin-bottom: 20px;
|
86 |
+
margin-left: 50px;
|
87 |
+
text-align: left;
|
88 |
+
margin-top: 0px;
|
89 |
+
}
|
90 |
+
|
91 |
+
.citation-section {
|
92 |
+
width: 100%;
|
93 |
+
margin-top: 50px;
|
94 |
+
text-align: center;
|
95 |
+
}
|
96 |
+
.citation-box {
|
97 |
+
background-color: #f8f9fa;
|
98 |
+
border: 1px solid #dee2e6;
|
99 |
+
border-radius: 8px;
|
100 |
+
padding: 10px;
|
101 |
+
margin-top: 5px;
|
102 |
+
font-size: 15px;
|
103 |
+
text-align: left;
|
104 |
+
font-family: 'Courier New', Courier, monospace;
|
105 |
+
white-space: pre;
|
106 |
+
}
|
107 |
+
|
108 |
+
.image-container-structure {
|
109 |
+
display: flex;
|
110 |
+
justify-content: center;
|
111 |
+
gap: 10px;
|
112 |
+
margin-bottom: 40px;
|
113 |
+
max-width: 70%; /* Adjust the width as needed */
|
114 |
+
margin: auto;
|
115 |
+
}
|
116 |
+
|
117 |
+
.image-container-structure a {
|
118 |
+
flex: 1;
|
119 |
+
}
|
120 |
+
|
121 |
+
.image-container-structure img {
|
122 |
+
max-width: 100%;
|
123 |
+
height: auto;
|
124 |
+
display: block;
|
125 |
+
margin: auto;
|
126 |
+
}
|
127 |
+
|
128 |
+
.image-container {
|
129 |
+
width: 100%;
|
130 |
+
margin-bottom: 40px;
|
131 |
+
}
|
132 |
+
.image-container #admin-questionnaire {
|
133 |
+
width: 50%;
|
134 |
+
height: auto;
|
135 |
+
display: block;
|
136 |
+
margin: auto;
|
137 |
+
}
|
138 |
+
.image-container #ro-image {
|
139 |
+
width: 70%;
|
140 |
+
height: auto;
|
141 |
+
display: block;
|
142 |
+
margin: auto;
|
143 |
+
}
|
144 |
+
|
145 |
+
.section-title {
|
146 |
+
font-size: 24px;
|
147 |
+
font-weight: bold;
|
148 |
+
text-align: center;
|
149 |
+
margin-bottom: 40px;
|
150 |
+
padding: 20px; /* Add padding for more margin around text */
|
151 |
+
background-color: #610b5d;
|
152 |
+
color: #fff; /* Ensure text is readable on dark background */
|
153 |
+
border-radius: 15px; /* Rounded edges */
|
154 |
+
}
|
155 |
+
.back-button {
|
156 |
+
text-align: center;
|
157 |
+
margin-top: 50px;
|
158 |
+
}
|
159 |
+
.custom-button {
|
160 |
+
background-color: #610b5d;
|
161 |
+
color: #fff; /* Set white text color */
|
162 |
+
border-radius: 15px; /* Rounded edges */
|
163 |
+
padding: 10px 20px; /* Padding for the button */
|
164 |
+
font-size: 18px; /* Increase font size */
|
165 |
+
text-decoration: none; /* Remove underline */
|
166 |
+
}
|
167 |
+
.custom-button:hover {
|
168 |
+
background-color: #812b7d;
|
169 |
+
color: #fff;
|
170 |
+
}
|
171 |
+
</style>
|
172 |
+
</head>
|
173 |
+
<body>
|
174 |
+
<div class="container">
|
175 |
+
<h1 class="mt-5">Stick To Your Role! Leaderboard</h1>
|
176 |
+
<div class="table-responsive">
|
177 |
+
<!-- Render the table HTML here -->
|
178 |
+
{{ table_html|safe }}
|
179 |
+
</div>
|
180 |
+
<div class="section">
|
181 |
+
<div class="section-title">Thank you for submitting your model!</div>
|
182 |
+
<p>
|
183 |
+
We will get back to you to confirm the receipt of the model.
|
184 |
+
If we do not get back to you within two weeks, please contact us at:
|
185 |
+
<a href= "mailto: [email protected]">[email protected]</a>.
|
186 |
+
</p>
|
187 |
+
<div class="back-button">
|
188 |
+
<a href="{{ url_for('index') }}" class="custom-button mt-3">Main page</a>
|
189 |
+
</div>
|
190 |
+
</div>
|
191 |
+
</div>
|
192 |
+
</div>
|
193 |
+
|
194 |
+
<!-- Include jQuery -->
|
195 |
+
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
|
196 |
+
<!-- Include Bootstrap JS -->
|
197 |
+
<script src="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/js/bootstrap.bundle.min.js"></script>
|
198 |
+
<!-- Include DataTables JS -->
|
199 |
+
<script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
|
200 |
+
<script src="https://cdn.datatables.net/1.11.5/js/dataTables.bootstrap5.min.js"></script>
|
201 |
+
<!-- Initialize DataTables -->
|
202 |
+
<script>
|
203 |
+
$(document).ready(function() {
|
204 |
+
const table = $('table').DataTable({
|
205 |
+
"paging": false,
|
206 |
+
"info": false,
|
207 |
+
"columnDefs": [
|
208 |
+
{ "orderable": false, "targets": 0 },
|
209 |
+
{ "searchable": false, "targets": 0 }
|
210 |
+
],
|
211 |
+
"order": [[ 2, 'desc' ]],
|
212 |
+
"drawCallback": function(settings) {
|
213 |
+
var api = this.api();
|
214 |
+
api.column(0, {order:'applied'}).nodes().each(function(cell, i) {
|
215 |
+
cell.innerHTML = i + 1;
|
216 |
+
});
|
217 |
+
}
|
218 |
+
});
|
219 |
+
});
|
220 |
+
|
221 |
+
</script>
|
222 |
+
</body>
|
223 |
+
</html>
|
templates/new_model.html
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>Stick To Your Role! Submit a model</title>
|
7 |
+
<!-- Include Bootstrap CSS for styling -->
|
8 |
+
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/css/bootstrap.min.css">
|
9 |
+
<!-- Include DataTables CSS -->
|
10 |
+
<link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/dataTables.bootstrap5.min.css">
|
11 |
+
<!-- Custom CSS for additional styling -->
|
12 |
+
<style>
|
13 |
+
body {
|
14 |
+
background-color: #f8f9fa;
|
15 |
+
font-family: 'Arial', sans-serif;
|
16 |
+
}
|
17 |
+
.container {
|
18 |
+
max-width: 1200px; /* Limit the width of the container */
|
19 |
+
margin: auto; /* Center the container */
|
20 |
+
padding: 20px; /* Add some padding */
|
21 |
+
background: #fff;
|
22 |
+
border-radius: 8px;
|
23 |
+
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
|
24 |
+
}
|
25 |
+
h1 {
|
26 |
+
color: #333;
|
27 |
+
text-align: center;
|
28 |
+
}
|
29 |
+
h2 {
|
30 |
+
color: #333;
|
31 |
+
margin-top: 30px;
|
32 |
+
text-align: center;
|
33 |
+
}
|
34 |
+
|
35 |
+
.section {
|
36 |
+
padding-top: 19px;
|
37 |
+
text-align: left;
|
38 |
+
}
|
39 |
+
|
40 |
+
.section p {
|
41 |
+
padding-left: 150px;
|
42 |
+
padding-right: 150px;
|
43 |
+
text-indent: 2em;
|
44 |
+
margin: auto;
|
45 |
+
margin-bottom: 10px;
|
46 |
+
text-align: left;
|
47 |
+
}
|
48 |
+
|
49 |
+
.section ol, ul {
|
50 |
+
padding-left: 150px;
|
51 |
+
padding-right: 150px;
|
52 |
+
margin: auto;
|
53 |
+
margin-bottom: 20px;
|
54 |
+
margin-left: 50px;
|
55 |
+
text-align: left;
|
56 |
+
margin-top: 0px;
|
57 |
+
}
|
58 |
+
|
59 |
+
.citation-section {
|
60 |
+
width: 100%;
|
61 |
+
margin-top: 50px;
|
62 |
+
text-align: center;
|
63 |
+
}
|
64 |
+
.citation-box {
|
65 |
+
background-color: #f8f9fa;
|
66 |
+
border: 1px solid #dee2e6;
|
67 |
+
border-radius: 8px;
|
68 |
+
padding: 10px;
|
69 |
+
margin-top: 5px;
|
70 |
+
font-size: 15px;
|
71 |
+
text-align: left;
|
72 |
+
font-family: 'Courier New', Courier, monospace;
|
73 |
+
white-space: pre;
|
74 |
+
}
|
75 |
+
|
76 |
+
.image-container-structure {
|
77 |
+
display: flex;
|
78 |
+
justify-content: center;
|
79 |
+
gap: 10px;
|
80 |
+
margin-bottom: 40px;
|
81 |
+
max-width: 70%; /* Adjust the width as needed */
|
82 |
+
margin: auto;
|
83 |
+
}
|
84 |
+
|
85 |
+
.image-container-structure a {
|
86 |
+
flex: 1;
|
87 |
+
}
|
88 |
+
|
89 |
+
.image-container-structure img {
|
90 |
+
max-width: 100%;
|
91 |
+
height: auto;
|
92 |
+
display: block;
|
93 |
+
margin: auto;
|
94 |
+
}
|
95 |
+
|
96 |
+
.image-container {
|
97 |
+
width: 100%;
|
98 |
+
margin-bottom: 40px;
|
99 |
+
}
|
100 |
+
.image-container #admin-questionnaire {
|
101 |
+
width: 50%;
|
102 |
+
height: auto;
|
103 |
+
display: block;
|
104 |
+
margin: auto;
|
105 |
+
}
|
106 |
+
.image-container #ro-image {
|
107 |
+
width: 70%;
|
108 |
+
height: auto;
|
109 |
+
display: block;
|
110 |
+
margin: auto;
|
111 |
+
}
|
112 |
+
|
113 |
+
.section-title {
|
114 |
+
font-size: 24px;
|
115 |
+
font-weight: bold;
|
116 |
+
text-align: center;
|
117 |
+
margin-bottom: 40px;
|
118 |
+
padding: 20px; /* Add padding for more margin around text */
|
119 |
+
background-color: #610b5d;
|
120 |
+
color: #fff; /* Ensure text is readable on dark background */
|
121 |
+
border-radius: 15px; /* Rounded edges */
|
122 |
+
}
|
123 |
+
.back-button {
|
124 |
+
text-align: center;
|
125 |
+
margin-top: 50px;
|
126 |
+
}
|
127 |
+
.custom-button {
|
128 |
+
background-color: #610b5d;
|
129 |
+
color: #fff; /* Set white text color */
|
130 |
+
border-radius: 15px; /* Rounded edges */
|
131 |
+
padding: 10px 20px; /* Padding for the button */
|
132 |
+
font-size: 18px; /* Increase font size */
|
133 |
+
text-decoration: none; /* Remove underline */
|
134 |
+
}
|
135 |
+
.custom-button:hover {
|
136 |
+
background-color: #812b7d;
|
137 |
+
color: #fff;
|
138 |
+
}
|
139 |
+
.form-container {
|
140 |
+
max-width: 80%; /* Adjust as needed */
|
141 |
+
margin: 20px 100px; /* Center horizontally */
|
142 |
+
padding: 50px 150px;
|
143 |
+
text-align: center;
|
144 |
+
background-color: #f8f9fa;
|
145 |
+
}
|
146 |
+
|
147 |
+
.form-row {
|
148 |
+
max-width: 100%;
|
149 |
+
margin-bottom: 20px;
|
150 |
+
text-align: left;
|
151 |
+
}
|
152 |
+
|
153 |
+
.form-label {
|
154 |
+
}
|
155 |
+
|
156 |
+
.col-md-4 {
|
157 |
+
width: 100%
|
158 |
+
}
|
159 |
+
.col-md-8 {
|
160 |
+
width: 100%
|
161 |
+
}
|
162 |
+
|
163 |
+
.form-content {
|
164 |
+
margin-bottom: 15px;
|
165 |
+
min-width: 100%;
|
166 |
+
}
|
167 |
+
.form-content::placeholder {
|
168 |
+
color: #aaa;
|
169 |
+
font-style: italic;
|
170 |
+
}
|
171 |
+
|
172 |
+
.file-input {
|
173 |
+
margin-top: 10px;
|
174 |
+
}
|
175 |
+
</style>
|
176 |
+
</head>
|
177 |
+
<body>
|
178 |
+
<div class="container">
|
179 |
+
<h1 class="mt-5">Stick To Your Role! Leaderboard</h1>
|
180 |
+
<div class="table-responsive">
|
181 |
+
<!-- Render the table HTML here -->
|
182 |
+
{{ table_html|safe }}
|
183 |
+
</div>
|
184 |
+
<div class="section">
|
185 |
+
<div id="evaluate_custom_model" class="section-title">Evaluate a custom model</div>
|
186 |
+
<p>
|
187 |
+
To evaluate a custom model you can use our <a href="https://gitlab.inria.fr/gkovac/value_stability">open-source code</a>.
|
188 |
+
If a model is in the Hugging Face Transformers format (saved either locally or on the Hub),
|
189 |
+
it can be added simply by writing a config file.
|
190 |
+
The model can then be evaluated as any other model.
|
191 |
+
To do so, follow the <a href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/README.md?ref_type=heads#adding-a-new-model">instructions</a> in the README.md file.
|
192 |
+
</p>
|
193 |
+
</div>
|
194 |
+
<div class="section" id="paper">
|
195 |
+
<div class="section-title">Submit a custom model to the Stick To Your Role! Leaderboard</div>
|
196 |
+
<p>
|
197 |
+
If you want, your model can be added to the Stick To Your Role! Leaderboard as an unofficial submission.
|
198 |
+
A separate list of models containing both official and unofficial submissions will be created.
|
199 |
+
The procedure is as follows:
|
200 |
+
</p>
|
201 |
+
<ol>
|
202 |
+
<li>
|
203 |
+
<b> Add and evaluate your model </b> - Add your model as a config file as described <a href="{{ url_for('new_model', _anchor='evaluate_custom_model') }}">above</a>.
|
204 |
+
This procedure should result in 9 JSON files such as:
|
205 |
+
<code>`Leaderboard/results/stability_leaderboard/&lt;your_model_name&gt;/chunk_0_&lt;timestamp&gt;/results.json`</code>
|
206 |
+
</li>
|
207 |
+
<li>
|
208 |
+
<b> Submit the config file </b> - Create a pull request to our <a href="https://gitlab.inria.fr/gkovac/value_stability">repository</a> from a branch <code>"unofficial_model/&lt;your_model_name&gt;"</code>.
|
209 |
+
The pull request should ideally only add the config file in <code>`./models/leaderboard_configs`</code>.
|
210 |
+
If additional changes are needed, they should ideally be constrained to a new model class (see <a href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/models/huggingfacemodel.py?ref_type=heads">huggingfacemodel.py</a> for reference).
</li>
|
211 |
+
<li>
|
212 |
+
<b> Submit the model results </b> - Submit the *.json files as a ZIP archive using the form below.
|
213 |
+
We will integrate the model's results on our side, and rerank models with yours included.
|
214 |
+
</li>
|
215 |
+
</ol>
|
216 |
+
<div class="form-container">
|
217 |
+
<form id="model-submission-form" method="POST" action="{{ url_for('submit_model') }}" enctype="multipart/form-data">
|
218 |
+
<div class="form-row row">
|
219 |
+
<div class="col-md-4">
|
220 |
+
<label for="model_name" class="form-label">Model Name:</label>
|
221 |
+
</div>
|
222 |
+
<div class="col-md-8">
|
223 |
+
<input type="text" class="form-content" id="model_name" name="model_name" required>
|
224 |
+
</div>
|
225 |
+
</div>
|
226 |
+
<div class="form-row row">
|
227 |
+
<div class="col-md-4">
|
228 |
+
<label for="pull_request_link" class="form-label">Pull Request Link:</label>
|
229 |
+
</div>
|
230 |
+
<div class="col-md-8">
|
231 |
+
<input type="url" class="form-content" id="pull_request_link" name="pull_request_link" required>
|
232 |
+
</div>
|
233 |
+
</div>
|
234 |
+
<div class="form-row row">
|
235 |
+
<div class="col-md-4">
|
236 |
+
<label for="email" class="form-label">Email:</label>
|
237 |
+
</div>
|
238 |
+
<div class="col-md-8">
|
239 |
+
<input type="email" class="form-content" id="email" name="email" required>
|
240 |
+
</div>
|
241 |
+
</div>
|
242 |
+
<div class="form-row row">
|
243 |
+
<div class="col-md-4">
|
244 |
+
<label for="description" class="form-label">Description:</label>
|
245 |
+
</div>
|
246 |
+
<div class="col-md-8">
|
247 |
+
<textarea class="form-content" id="description" name="description" placeholder="Various details on the model training and architecture (e.g. dataset, model size, optimizer, etc.)" rows="3" required></textarea>
|
248 |
+
</div>
|
249 |
+
</div>
|
250 |
+
<div class="form-row row">
|
251 |
+
<div class="col-md-4">
|
252 |
+
<label for="model_files" class="form-label">
|
253 |
+
Upload the Model results directory as a ZIP file
|
254 |
+
(<code>Leaderboard/results/stability_leaderboard/&lt;your_model_name&gt;</code>):
|
255 |
+
</label>
|
256 |
+
</div>
|
257 |
+
<div class="col-md-8">
|
258 |
+
<input type="file" id="model_files" name="model_files" class="file-input" accept=".zip" required>
|
259 |
+
<small class="form-text text-muted">
|
260 |
+
Please upload a ZIP file containing the results directory.
|
261 |
+
</small>
|
262 |
+
</div>
|
263 |
+
</div>
|
264 |
+
<button type="submit" class="btn custom-button mt-3">Submit</button>
|
265 |
+
</form>
|
266 |
+
</div>
|
267 |
+
</div>
|
268 |
+
<div class="back-button">
|
269 |
+
<a href="{{ url_for('index') }}" class="custom-button mt-3">Main page</a>
|
270 |
+
</div>
|
271 |
+
<div class="citation-section">
|
272 |
+
<p>If you found this project useful, please cite our related paper:</p>
|
273 |
+
<div class="citation-box" id="citation-text">
|
274 |
+
@article{kovavc2024stick,
|
275 |
+
title={Stick to your Role! Stability of Personal Values Expressed in Large Language Models},
|
276 |
+
author={Kova{\v{c}}, Grgur and Portelas, R{\'e}my and Sawayama, Masataka and Dominey, Peter Ford and Oudeyer, Pierre-Yves},
|
277 |
+
journal={arXiv preprint arXiv:2402.14846},
|
278 |
+
year={2024}
|
279 |
+
}
|
280 |
+
</div>
|
281 |
+
</div>
|
282 |
+
</div>
|
283 |
+
|
284 |
+
<!-- Include jQuery -->
|
285 |
+
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
|
286 |
+
<!-- Include Bootstrap JS -->
|
287 |
+
<script src="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/js/bootstrap.bundle.min.js"></script>
|
288 |
+
<!-- Include DataTables JS -->
|
289 |
+
<script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
|
290 |
+
<script src="https://cdn.datatables.net/1.11.5/js/dataTables.bootstrap5.min.js"></script>
|
291 |
+
<!-- Initialize DataTables -->
|
292 |
+
<script>
|
293 |
+
// Once the DOM is ready, initialise DataTables on every <table> element in the page.
// NOTE(review): the selector matches any <table>; the table markup comes from the
// table_html template variable - confirm the route rendering this page supplies it.
$(document).ready(function() {
|
294 |
+
const table = $('table').DataTable({ // `table` is not referenced again in this script
|
295 |
+
"paging": false, // pagination disabled
|
296 |
+
"info": false, // table information summary disabled
|
297 |
+
"columnDefs": [
|
298 |
+
{ "orderable": false, "targets": 0 }, // column 0 cannot be used for sorting
|
299 |
+
{ "searchable": false, "targets": 0 } // column 0 is excluded from search
|
300 |
+
],
|
301 |
+
"order": [[ 2, 'desc' ]], // initial sort: column index 2, descending
|
302 |
+
"drawCallback": function(settings) { // DataTables invokes this on every draw
|
303 |
+
var api = this.api();
|
304 |
+
api.column(0, {order:'applied'}).nodes().each(function(cell, i) { // column 0 cells in the currently applied order
|
305 |
+
cell.innerHTML = i + 1; // overwrite with the 1-based position (presumably the rank column - confirm)
|
306 |
+
});
|
307 |
+
}
|
308 |
+
});
|
309 |
+
});
|
310 |
+
</script>
|
311 |
+
</body>
|
312 |
+
</html>
|