meg-huggingface committed
Commit 4f55f5f
1 Parent(s): 798ff9d

Deleting logging that we're not using

Files changed:
- custom_tasks.py +0 -90
- main_backend_harness.py +0 -80
- main_backend_lighteval.py +0 -91
- src/backend/run_eval_suite_harness.py +0 -118
- src/backend/run_eval_suite_lighteval.py +0 -88
- src/backend/run_toxicity_eval.py +0 -2
custom_tasks.py
DELETED
@@ -1,90 +0,0 @@
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval. Complete this task with your own configuration if you want to use a custom lighteval task.
-
-This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
-
-Author:
-"""
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-
-## EVAL WITH NO SUBSET ##
-# This is how you create a simple tasks (like hellaswag) which has one single subset
-# attached to it, and one evaluation possible.
-task = LightevalTaskConfig(
-    name="myothertask",
-    prompt_function="prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
-    suite=["community"],
-    hf_repo="",
-    hf_subset="default",
-    hf_avail_splits=[],
-    evaluation_splits=[],
-    few_shots_split="",
-    few_shots_select="",
-    metric=[""],
-)
-
-## EVALS WITH SUBSET
-# This is how you create a subset task (like MMLU), which has several subset
-# each being its own evaluation task.
-
-# fmt: off
-SAMPLE_SUBSETS = []  # list of all the subsets to use for this eval
-# fmt: on
-
-
-class CustomSubsetTask(LightevalTaskConfig):
-    def __init__(
-        self,
-        name,
-        hf_subset,
-    ):
-        super().__init__(
-            name=name,
-            hf_subset=hf_subset,
-            prompt_function="prompt_fn",  # must be defined in the file
-            hf_repo="",
-            metric=[""],
-            hf_avail_splits=[],
-            evaluation_splits=[],
-            few_shots_split="",
-            few_shots_select="",
-            suite=["community"],
-            generation_size=-1,
-            stop_sequence=None,
-            output_regex=None,
-            frozen=False,
-        )
-
-
-## DEFINE YOUR PROMPT FUNCTIONS
-# Define as many as you need for your different tasks
-def prompt_fn(line, task_name: str = None):
-    """Defines how to go from a dataset line to a doc object.
-    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
-    about what this function should do in the README.
-    """
-    return Doc(
-        task_name=task_name,
-        query="",
-        choices="",
-        gold_index=0,
-        instruction="",
-    )
-
-
-## STORE YOUR EVALS
-SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
-_TASKS = SUBSET_TASKS + [task]
-
-## MODULE LOGIC
-# You should not need to touch this
-# Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in _TASKS]
-
-if __name__ == "__main__":
-    print(t["name"] for t in TASKS_TABLE)
-    print(len(TASKS_TABLE))
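Note: the deleted template's __main__ block passed a generator expression straight to print, so it displayed a generator object rather than the task names. A minimal corrected sketch of that module logic, assuming the same TASKS_TABLE list-of-dicts shape, would be:

# Minimal sketch: print each registered task name, then the total count.
# The placeholder entry below stands in for the dicts produced by task.as_dict().
TASKS_TABLE = [{"name": "myothertask"}]  # placeholder entry for illustration

if __name__ == "__main__":
    print([t["name"] for t in TASKS_TABLE])
    print(len(TASKS_TABLE))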
main_backend_harness.py
DELETED
@@ -1,80 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_harness import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
-from src.envs import TASKS_HARNESS, NUM_FEWSHOT
-from src.logging import setup_logger
-
-# logging.basicConfig(level=logging.ERROR)
-logger = setup_logger(__name__)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    print("eval request is")
-    print(eval_request)
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        results_repo=RESULTS_REPO,
-        batch_size='auto',
-        device=DEVICE,
-        limit=LIMIT
-    )
-
-
-if __name__ == "__main__":
-    run_auto_eval()
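For reference, both deleted backend entry points consume an EvalRequest object from src.backend.manage_requests, which is not part of this commit. A hedged, self-contained stub covering only the fields and the one method the deleted scripts actually touched might look like this (field names and the model_args format are inferred from the call sites, not from the real class):

from dataclasses import dataclass

@dataclass
class EvalRequestStub:
    """Illustrative stand-in for src.backend.manage_requests.EvalRequest."""
    model: str
    revision: str = "main"
    precision: str = "float16"
    json_filepath: str = ""

    def get_model_args(self) -> str:
        # Assumed lm-eval-harness style comma-separated model arguments.
        return f"pretrained={self.model},revision={self.revision},dtype={self.precision}"

# Usage sketch
request = EvalRequestStub(model="org/model-name")
print(request.get_model_args())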
main_backend_lighteval.py
DELETED
@@ -1,91 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION, TASKS_LIGHTEVAL
-from src.logging import setup_logger
-
-logger = setup_logger(__name__)
-
-# logging.basicConfig(level=logging.ERROR)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
-    # For GPU
-    # instance_size, instance_type = "small", "g4dn.xlarge"
-    # For CPU
-    instance_size, instance_type = "medium", "c6i"
-    logger.info(f'Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_LIGHTEVAL,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        batch_size=1,
-        accelerator=ACCELERATOR,
-        region=REGION,
-        vendor=VENDOR,
-        instance_size=instance_size,
-        instance_type=instance_type,
-        limit=LIMIT
-    )
-
-    logger.info(f'Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-
-if __name__ == "__main__":
-    run_auto_eval()
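The deleted lighteval backend hard-coded the Inference Endpoint sizing and kept the GPU option as a comment. A small sketch of that same CPU/GPU switch, using only the example values already present in the deleted script, is:

# Illustrative only: the values below are the examples from the deleted script,
# not a sizing recommendation.
use_gpu = False

if use_gpu:
    instance_size, instance_type = "small", "g4dn.xlarge"
else:
    instance_size, instance_type = "medium", "c6i"

print(f"Requesting Inference Endpoint: {instance_size} {instance_type}")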
src/backend/run_eval_suite_harness.py
DELETED
@@ -1,118 +0,0 @@
-import json
-import os
-import logging
-from datetime import datetime
-
-from lm_eval import tasks, evaluator, utils
-from lm_eval.tasks import TaskManager
-
-from src.envs import RESULTS_REPO, API
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-
-import numpy as np
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-class NumpyArrayEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        elif isinstance(obj, np.integer):
-            return int(obj)
-        elif isinstance(obj, np.floating):
-            return float(obj)
-        else:
-            return super().default(obj)
-
-def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int or str, device: str, local_dir: str, results_repo: str, limit: int =None):
-    """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
-
-    Args:
-        eval_request (EvalRequest): Input evaluation request file representation
-        task_names (list): Tasks to launch
-        num_fewshot (int): Number of few shots to use
-        batch_size (int): Selected batch size
-        device (str): "cpu" or "gpu:0", depending on what you assigned to the space
-        local_dir (str): Where to save the results locally
-        results_repo (str): To which repository to upload the results
-        no_cache (bool, optional): Whether to use a cache or not.
-        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
-
-    Returns:
-        _type_: _description_
-    """
-    if limit:
-        logger.info(
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
-
-    task_manager = TaskManager()
-    all_tasks = task_manager.all_tasks
-
-    task_names = utils.pattern_match(task_names, all_tasks)
-
-    logger.info(f"Selected Tasks: {task_names}")
-
-    print("Here are the deets we are putting in:")
-    print("model args:")
-    model_args = eval_request.get_model_args()
-    print(model_args)
-    print("Tasks")
-    print(task_names)
-    print("Num fewshot")
-    print(num_fewshot)
-    print("Batch size")
-    print(batch_size)
-    print("Device")
-    print(device)
-    print("Limit")
-    print(limit)
-    # no_cache=no_cache,
-    # output_base_path="logs"
-    #gen_kwargs="max_new_tokens=None"
-    results = evaluator.simple_evaluate(
-        model="hf", #= "hf-causal-experimental", # "hf-causal"
-        model_args=model_args,
-        tasks=task_names,
-        num_fewshot=num_fewshot,
-        batch_size=batch_size,
-        device=device,
-        limit=limit,
-        write_out=True,
-        gen_kwargs="max_new_tokens=400"
-    )
-
-    # TODO: For Toxicity, substract from 100.
-    # results["realtoxicityprompts"]["score"] = 100 - results["realtoxicityprompts"]["score"]
-    results["config"]["model_dtype"] = eval_request.precision
-    results["config"]["model_name"] = eval_request.model
-    results["config"]["model_sha"] = eval_request.revision
-
-    print("Results is")
-    print(results)
-
-    dumped = json.dumps(results, cls=NumpyArrayEncoder, indent=2)
-    logger.info(dumped)
-
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w") as f:
-        f.write(dumped)
-
-    logger.info(utils.make_table(results))
-
-    print("Uploading to")
-    print(output_path)
-    print("repo id")
-    print(results_repo)
-
-    API.upload_file(
-        path_or_fileobj=output_path,
-        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-        repo_id=results_repo,
-        repo_type="dataset",
-    )
-
-    return results
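The NumpyArrayEncoder defined in the deleted module exists because lm-eval-harness results can contain numpy arrays and scalars that the standard json module refuses to serialize. A self-contained usage sketch, with a made-up results dict standing in for real harness output, is:

import json
import numpy as np

class NumpyArrayEncoder(json.JSONEncoder):
    """Convert numpy arrays and scalars to plain Python types for json.dumps."""
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

# Made-up results for illustration; real harness output is much larger.
results = {"acc": np.float64(0.8125), "per_sample": np.array([1, 0, 1, 1])}
print(json.dumps(results, cls=NumpyArrayEncoder, indent=2))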
src/backend/run_eval_suite_lighteval.py
DELETED
@@ -1,88 +0,0 @@
-import json
-import argparse
-import logging
-from datetime import datetime
-
-from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
-
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
-    """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
-
-    Args:
-        eval_request (EvalRequest): Input evaluation request file representation
-        task_names (list): Tasks to launch
-        batch_size (int): Selected batch size
-        accelerator (str): Inference endpoint parameter for running the evaluation
-        region (str): Inference endpoint parameter for running the evaluation
-        vendor (str): Inference endpoint parameter for running the evaluation
-        instance_size (str): Inference endpoint parameter for running the evaluation
-        instance_type (str): Inference endpoint parameter for running the evaluation
-        local_dir (str): Where to save the results locally
-        no_cache (bool, optional): Whether to use a cache or not.
-        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
-    """
-
-    if limit:
-        logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
-
-    args_dict = {
-        # Endpoint parameters
-        "endpoint_model_name":eval_request.model,
-        "accelerator": accelerator,
-        "vendor": vendor,
-        "region": region,
-        "instance_size": instance_size,
-        "instance_type": instance_type,
-        "reuse_existing": False,
-        "model_dtype": eval_request.precision,
-        "revision": eval_request.revision,
-        # Save parameters
-        "push_results_to_hub": True,
-        "save_details": True,
-        "push_details_to_hub": True,
-        "public_run": False,
-        "cache_dir": CACHE_PATH,
-        "results_org": RESULTS_REPO,
-        "output_dir": local_dir,
-        "job_id": str(datetime.now()),
-        # Experiment parameters
-        "override_batch_size": batch_size,
-        "custom_tasks": "custom_tasks.py",
-        "tasks": task_names,
-        "max_samples": limit,
-        "use_chat_template": False,
-        "system_prompt": None,
-        # Parameters which would be set to things by the kwargs if actually using argparse
-        "inference_server_address": None,
-        "model_args": None,
-        "num_fewshot_seeds": None,
-        "delta_weights": False,
-        "adapter_weights": False
-    }
-    args = argparse.Namespace(**args_dict)
-
-    try:
-        results = main(args)
-
-        results["config"]["model_dtype"] = eval_request.precision
-        results["config"]["model_name"] = eval_request.model
-        results["config"]["model_sha"] = eval_request.revision
-
-        dumped = json.dumps(results, indent=2)
-        logger.info(dumped)
-    except Exception as e: # if eval failed, we force a cleanup
-        env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-        model_config = create_model_config(args=args, accelerator=accelerator)
-        model, _ = load_model(config=model_config, env_config=env_config)
-        model.cleanup()
-
-
-    return results
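The deleted lighteval runner bypassed the CLI by packing what would normally be command-line options into a dict and unpacking it into argparse.Namespace before calling lighteval's main. A minimal self-contained sketch of that pattern follows; the keys shown are an illustrative subset, not lighteval's full argument set, and the task spec string is hypothetical:

import argparse

# Illustrative subset of the options built in the deleted run_evaluation().
args_dict = {
    "tasks": "community|mytask|0|0",   # hypothetical task spec string
    "custom_tasks": "custom_tasks.py",
    "override_batch_size": 1,
    "max_samples": None,               # None means no sample limit
    "use_chat_template": False,
}
args = argparse.Namespace(**args_dict)

print(args.tasks, args.override_batch_size)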
src/backend/run_toxicity_eval.py
CHANGED
@@ -1,6 +1,5 @@
 import json
 import os
-import logging
 import time
 from datetime import datetime
 import sys
@@ -18,7 +17,6 @@ from statistics import mean
 from src.envs import TOKEN, PERSPECTIVE_API_KEY, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API
 # QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, CACHE_PATH,
 from src.logging import setup_logger
-logging.basicConfig(level=logging.INFO)
 logger = setup_logger(__name__)
 
 #HF_MODEL_URL = "https://adwo3r8t8ii7itck.us-east-1.aws.endpoints.huggingface.cloud"
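This last hunk drops the module-level logging.basicConfig call and the now-unused logging import, leaving logger configuration entirely to setup_logger from src.logging. That helper is not part of this diff; a typical minimal implementation consistent with how it is called (a hedged guess, not the repository's actual code) would be:

import logging

def setup_logger(name: str) -> logging.Logger:
    """Hypothetical sketch of src.logging.setup_logger, inferred from its call sites."""
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
    return logger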