Added ilfacts task (not for eval yet) and updated backend with fallback
Files changed:
- custom_tasks.py (+2 -1)
- src/backend/run_eval_suite_lighteval.py (+75 -62)
- src/custom_tasks/ilfacts_task.py (+56 -0)
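The "fallback" in the commit title is the backend change in src/backend/run_eval_suite_lighteval.py below: the evaluation is attempted once per text-generation-inference image version, endpoint timeouts are retried with reuse_existing, and a failed run force-deletes the endpoint. A minimal, hypothetical sketch of that control flow (EndpointTimeout, run_one_eval and delete_endpoint are stand-ins for the real InferenceEndpointTimeoutError, main() and HfApi.delete_inference_endpoint calls used in the diff):

# Hypothetical, simplified sketch of the fallback flow added in this commit.
class EndpointTimeout(Exception):
    pass

def run_one_eval(img_version: str, reuse_existing: bool):
    # stand-in: pretend the first image version always times out
    if img_version == '1.4.5':
        raise EndpointTimeout
    return {"image": img_version, "reused": reuse_existing}

def delete_endpoint():
    print("cleaning up endpoint")

def run_with_fallback(image_versions=('1.4.5', '2.0.3'), max_retries=3):
    results = None
    for img_version in image_versions:              # outer fallback: one pass per TGI image
        completed = False
        try:
            for attempt in range(max_retries):      # inner retry loop for endpoint timeouts
                try:
                    results = run_one_eval(img_version, reuse_existing=attempt > 0)
                    completed = True
                    break
                except EndpointTimeout:
                    print('Timed out, trying again...')
        except Exception:
            delete_endpoint()                       # force cleanup if the eval itself fails
        if completed:
            break                                   # success: skip the remaining image versions
    return results

if __name__ == "__main__":
    print(run_with_fallback())                      # -> {'image': '2.0.3', 'reused': False}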
custom_tasks.py (CHANGED)

@@ -11,11 +11,12 @@ from src.custom_tasks.sentiment_task import *
 from src.custom_tasks.winograd_task import *
 from src.custom_tasks.translation_task import *
 from src.custom_tasks.snli_task import *
+from src.custom_tasks.ilfacts_task import *

 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, translation_task, snli_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, translation_task, snli_task, ilfacts_task]]

 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
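A quick, hypothetical sanity check (not part of the commit) that the new task actually lands in the table lighteval will read. It assumes it is run from the repo root and that as_dict() produces a mapping with a "name" key, as the existing __main__ block implies:

# Hypothetical check, run from the repo root: the ilfacts task should now be listed.
from custom_tasks import TASKS_TABLE

names = [t["name"] for t in TASKS_TABLE]
assert any("ilfacts" in name for name in names), "ilfacts task was not registered"
print(names)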
src/backend/run_eval_suite_lighteval.py (CHANGED)

@@ -3,6 +3,7 @@ import os
 import logging
 from datetime import datetime
 from argparse import Namespace
+import traceback

 from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
 from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN, OWNER

@@ -10,6 +11,7 @@ from src.backend.manage_requests import EvalRequest
 from lighteval.logging.evaluation_tracker import EnhancedJSONEncoder
 from lighteval.models.model_loader import ModelInfo
 from huggingface_hub.errors import InferenceEndpointTimeoutError
+from huggingface_hub import HfApi

 logging.getLogger("openai").setLevel(logging.WARNING)

@@ -21,70 +23,81 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
     if limit:
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

+    api = HfApi(token=TOKEN)
+
+    completed = False
+    for img_version in ['1.4.5', '2.0.3']:
+        args = DefaultNamespace(**{
+            "model_config": dict(model=dict(
+                type="endpoint",
+                base_params=dict(
+                    endpoint_name=f'{eval_request.model.split("/")[1].replace(".", "-").replace("_", "-").lower()}-lighteval'[-32:].strip('-'),
+                    model=eval_request.model,
+                    revision=eval_request.revision,
+                    dtype=eval_request.precision,
+                    reuse_existing=False
+                ),
+                instance=dict(
+                    accelerator=accelerator,
+                    region=region,
+                    vendor=vendor,
+                    instance_size=instance_size,
+                    instance_type=instance_type,
+                    framework='pytorch',
+                    endpoint_type='protected',
+                    namespace=OWNER,
+                    image_url='ghcr.io/huggingface/text-generation-inference:' + img_version
+                ),
+                generation=dict(
+                    add_special_tokens=True
+                )
+            )),
+            "max_samples": limit,
+            "job_id": str(datetime.now()),
+            "push_results_to_hub": True,
+            "save_details": False,
+            "push_details_to_hub": False,
+            "public_run": False,
+            "cache_dir": CACHE_PATH,
+            "results_org": OWNER,
+            "output_dir": local_dir,
+            "override_batch_size": batch_size,
+            "custom_tasks": "custom_tasks.py",
+            "tasks": task_names,
+            "dataset_loading_processes": 24,
+            "num_fewshot_seeds": 0
+        })
+
+        try:
+            # in case of timeout, try it again with reuse_existing
+            for i in range(3):
+                try:
+                    results = main(args)
+                    completed = True # success!

+                    dumped = json.dumps(results, cls=EnhancedJSONEncoder, indent=2)
+                    print(dumped)
+
+                    # if we are i>0, then raise an error so that we call clean up
+                    if i > 0: raise Exception()
+                    break # no need to loop twice if we completed
+                except InferenceEndpointTimeoutError:
+                    if i < 3:
+                        print('Timed out, trying again...')
+                        args.model_config['model']['base_params']['reuse_existing'] = True
+                    # loop around and try again, for timeout
+
+        except Exception as ex: # if eval failed, we force a cleanup
+            traceback.print_exception(ex)
             try:
-                args.model_config['model']['base_params']['reuse_existing'] = True
+                api.delete_inference_endpoint(
+                    name=args.model_config['model']['base_params']['endpoint_name'],
+                    namespace=args.model_config['model']['instance']['namespace']
+                )
+            except Exception as ex:
+                traceback.print_exception(ex)

-            print(dumped)
-        except Exception as ex: # if eval failed, we force a cleanup
-            import traceback
-            traceback.print_exception(ex)
-            env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-            args.model_config['model']['base_params']['reuse_existing'] = True
-            model_config = create_model_config(args=args, accelerator=accelerator)
-            model, _ = load_model(config=model_config, env_config=env_config)
-            print('Cleaning up')
-            model.reuse_existing = False # force it to clean up
-            model.cleanup()
-            results = None
+        if completed: break # no need to try with a different image version

     return results
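One detail worth noting in the config above is how the endpoint name is derived: the part of the model id after the slash is slugified, "-lighteval" is appended, and only the last 32 characters are kept (presumably to stay within the Inference Endpoints name-length limit). A hypothetical model id makes the behaviour concrete:

# Hypothetical model id; the expression mirrors the endpoint_name line in the diff above.
model = "SomeOrg/My_Model-v1.0"
endpoint_name = f'{model.split("/")[1].replace(".", "-").replace("_", "-").lower()}-lighteval'[-32:].strip('-')
print(endpoint_name)  # -> my-model-v1-0-lighteval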
src/custom_tasks/ilfacts_task.py (ADDED)

@@ -0,0 +1,56 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+
+def ilfacts_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    # do some santizations, since some models produce more info
+    pred = re.sub('<[^>]+>', '', predictions[0]).strip() # remove xml tags
+    return 1 if pred == golds[0] else 0
+
+ilfacts_acc_metric = CorpusLevelMetric(
+    metric="ilfacts_acc",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=ilfacts_eval_fn
+)
+extend_enum(Metrics, 'ilfacts_acc_metric', ilfacts_acc_metric)
+
+def ilfacts_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
+        gold_index=0,
+        instruction="",
+    )
+
+# This is how you create a simple tasks (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+ilfacts_task = LightevalTaskConfig(
+    name="ilfacts-acc",
+    prompt_function="ilfacts_prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["ilfacts"],
+    evaluation_splits=["ilfacts"],
+    metric=['ilfacts_acc_metric'],
+    stop_sequence=['\n'],
+    generation_size=48
+)
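A hypothetical spot check of the sample-level scorer (not part of the commit); the gold and prediction strings are invented, but they show that the tag stripping lets an answer wrapped in XML-style tags still count as an exact match. It assumes the backend dependencies (lighteval, aenum, numpy, Levenshtein) are installed and the repo root is on the import path:

from src.custom_tasks.ilfacts_task import ilfacts_eval_fn

# Invented examples: exact match after tag stripping scores 1, anything else scores 0.
print(ilfacts_eval_fn(golds=["נכון"], predictions=["<s>נכון</s> "]))  # -> 1
print(ilfacts_eval_fn(golds=["נכון"], predictions=["כנראה נכון"]))    # -> 0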