Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
pminervini
committed on
Commit
•
95cc038
1
Parent(s):
62679c8
update
Browse files
- cli/averitec-upload-cli.py +12 -0
- cli/halueval-cli.py +11 -2
- cli/submit-cli.py +7 -2
cli/averitec-upload-cli.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
from datasets import load_dataset
|
4 |
+
|
5 |
+
path = 'pminervini/averitec'
|
6 |
+
|
7 |
+
ds = load_dataset("json",
|
8 |
+
data_files={
|
9 |
+
'train': '/Users/pasquale/workspace/AVeriTeC/data/train.json',
|
10 |
+
'dev': '/Users/pasquale/workspace/AVeriTeC/data/dev.json'
|
11 |
+
})
|
12 |
+
ds.push_to_hub(path)
|
cli/halueval-cli.py
CHANGED
@@ -33,7 +33,13 @@ def main():
|
|
33 |
eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
|
34 |
|
35 |
# my_task = Task("memo-trap", "acc", "memo-trap", 0)
|
36 |
-
my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
TASKS_HARNESS = [my_task]
|
39 |
# task_names = ['triviaqa']
|
@@ -48,7 +54,10 @@ def main():
|
|
48 |
|
49 |
for task in TASKS_HARNESS:
|
50 |
print(f"Selected Tasks: [{task}]")
|
51 |
-
|
|
|
|
|
|
|
52 |
batch_size=1, device="mps", use_cache=None, limit=10, write_out=True)
|
53 |
print('AAA', results["results"])
|
54 |
|
|
|
33 |
eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
|
34 |
|
35 |
# my_task = Task("memo-trap", "acc", "memo-trap", 0)
|
36 |
+
# my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
|
37 |
+
# my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
|
38 |
+
my_task = Task("fever10", "acc", "FEVER", 5)
|
39 |
+
|
40 |
+
eval_logger = utils.eval_logger
|
41 |
+
import logging
|
42 |
+
eval_logger.setLevel(getattr(logging, "DEBUG"))
|
43 |
|
44 |
TASKS_HARNESS = [my_task]
|
45 |
# task_names = ['triviaqa']
|
|
|
54 |
|
55 |
for task in TASKS_HARNESS:
|
56 |
print(f"Selected Tasks: [{task}]")
|
57 |
+
import torch
|
58 |
+
|
59 |
+
# breakpoint()
|
60 |
+
results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
|
61 |
batch_size=1, device="mps", use_cache=None, limit=10, write_out=True)
|
62 |
print('AAA', results["results"])
|
63 |
|
cli/submit-cli.py
CHANGED
@@ -120,7 +120,10 @@ def main():
|
|
120 |
model_lst = [m for m in model_lst]
|
121 |
|
122 |
def custom_filter(m) -> bool:
|
123 |
-
|
|
|
|
|
|
|
124 |
|
125 |
filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
|
126 |
|
@@ -138,6 +141,8 @@ def main():
|
|
138 |
|
139 |
requested_model_names = {e.model for e in eval_requests}
|
140 |
|
|
|
|
|
141 |
for i in range(min(200, len(filtered_model_lst))):
|
142 |
model = filtered_model_lst[i]
|
143 |
|
@@ -157,7 +162,7 @@ def main():
|
|
157 |
|
158 |
if 'mage' not in model.id:
|
159 |
add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type=model_type)
|
160 |
-
time.sleep(
|
161 |
else:
|
162 |
print(f'Model {model.id} already added, not adding it to the queue again.')
|
163 |
|
|
|
120 |
model_lst = [m for m in model_lst]
|
121 |
|
122 |
def custom_filter(m) -> bool:
|
123 |
+
# res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False
|
124 |
+
# res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False and 'mistralai/' in m.id
|
125 |
+
res = 'mistralai/' in m.id
|
126 |
+
return res
|
127 |
|
128 |
filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
|
129 |
|
|
|
141 |
|
142 |
requested_model_names = {e.model for e in eval_requests}
|
143 |
|
144 |
+
breakpoint()
|
145 |
+
|
146 |
for i in range(min(200, len(filtered_model_lst))):
|
147 |
model = filtered_model_lst[i]
|
148 |
|
|
|
162 |
|
163 |
if 'mage' not in model.id:
|
164 |
add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type=model_type)
|
165 |
+
time.sleep(10)
|
166 |
else:
|
167 |
print(f'Model {model.id} already added, not adding it to the queue again.')
|
168 |
|