meg-huggingface committed on
Commit 4f55f5f
Parent: 798ff9d

Deleting logging that we're not using

custom_tasks.py DELETED
@@ -1,90 +0,0 @@
- # ruff: noqa: F405, F403, F401
- """
- Custom evaluation tasks for lighteval. Complete this task with your own configuration if you want to use a custom lighteval task.
-
- This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
-
- Author:
- """
- from lighteval.tasks.lighteval_task import LightevalTaskConfig
- from lighteval.tasks.requests import Doc
- from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-
- ## EVAL WITH NO SUBSET ##
- # This is how you create a simple tasks (like hellaswag) which has one single subset
- # attached to it, and one evaluation possible.
- task = LightevalTaskConfig(
-     name="myothertask",
-     prompt_function="prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
-     suite=["community"],
-     hf_repo="",
-     hf_subset="default",
-     hf_avail_splits=[],
-     evaluation_splits=[],
-     few_shots_split="",
-     few_shots_select="",
-     metric=[""],
- )
-
- ## EVALS WITH SUBSET
- # This is how you create a subset task (like MMLU), which has several subset
- # each being its own evaluation task.
-
- # fmt: off
- SAMPLE_SUBSETS = []  # list of all the subsets to use for this eval
- # fmt: on
-
-
- class CustomSubsetTask(LightevalTaskConfig):
-     def __init__(
-         self,
-         name,
-         hf_subset,
-     ):
-         super().__init__(
-             name=name,
-             hf_subset=hf_subset,
-             prompt_function="prompt_fn",  # must be defined in the file
-             hf_repo="",
-             metric=[""],
-             hf_avail_splits=[],
-             evaluation_splits=[],
-             few_shots_split="",
-             few_shots_select="",
-             suite=["community"],
-             generation_size=-1,
-             stop_sequence=None,
-             output_regex=None,
-             frozen=False,
-         )
-
-
- ## DEFINE YOUR PROMPT FUNCTIONS
- # Define as many as you need for your different tasks
- def prompt_fn(line, task_name: str = None):
-     """Defines how to go from a dataset line to a doc object.
-     Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
-     about what this function should do in the README.
-     """
-     return Doc(
-         task_name=task_name,
-         query="",
-         choices="",
-         gold_index=0,
-         instruction="",
-     )
-
-
- ## STORE YOUR EVALS
- SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
- _TASKS = SUBSET_TASKS + [task]
-
- ## MODULE LOGIC
- # You should not need to touch this
- # Convert to dict for lighteval
- TASKS_TABLE = [task.as_dict() for task in _TASKS]
-
- if __name__ == "__main__":
-     print(t["name"] for t in TASKS_TABLE)
-     print(len(TASKS_TABLE))
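
Note on the deleted template above: its __main__ block passes a generator expression straight to print(), so running the file would display a generator object rather than the task names. A minimal corrected sketch (an editor's illustration, not part of this commit):

if __name__ == "__main__":
    # Materialize the generator so the task names are actually printed
    print([t["name"] for t in TASKS_TABLE])
    print(len(TASKS_TABLE))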
 
main_backend_harness.py DELETED
@@ -1,80 +0,0 @@
- import logging
- import pprint
-
- from huggingface_hub import snapshot_download
-
- logging.getLogger("openai").setLevel(logging.WARNING)
-
- from src.backend.run_eval_suite_harness import run_evaluation
- from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
- from src.backend.sort_queue import sort_models_by_priority
-
- from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
- from src.envs import TASKS_HARNESS, NUM_FEWSHOT
- from src.logging import setup_logger
-
- # logging.basicConfig(level=logging.ERROR)
- logger = setup_logger(__name__)
- pp = pprint.PrettyPrinter(width=80)
-
- PENDING_STATUS = "PENDING"
- RUNNING_STATUS = "RUNNING"
- FINISHED_STATUS = "FINISHED"
- FAILED_STATUS = "FAILED"
-
- snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
- def run_auto_eval():
-     current_pending_status = [PENDING_STATUS]
-
-     # pull the eval dataset from the hub and parse any eval requests
-     # check completed evals and set them to finished
-     check_completed_evals(
-         api=API,
-         checked_status=RUNNING_STATUS,
-         completed_status=FINISHED_STATUS,
-         failed_status=FAILED_STATUS,
-         hf_repo=QUEUE_REPO,
-         local_dir=EVAL_REQUESTS_PATH_BACKEND,
-         hf_repo_results=RESULTS_REPO,
-         local_dir_results=EVAL_RESULTS_PATH_BACKEND
-     )
-
-     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-     eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-     # Sort the evals by priority (first submitted first run)
-     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-     print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-     if len(eval_requests) == 0:
-         return
-
-     eval_request = eval_requests[0]
-     logger.info(pp.pformat(eval_request))
-
-     set_eval_request(
-         api=API,
-         eval_request=eval_request,
-         set_to_status=RUNNING_STATUS,
-         hf_repo=QUEUE_REPO,
-         local_dir=EVAL_REQUESTS_PATH_BACKEND,
-     )
-
-     print("eval request is")
-     print(eval_request)
-     run_evaluation(
-         eval_request=eval_request,
-         task_names=TASKS_HARNESS,
-         num_fewshot=NUM_FEWSHOT,
-         local_dir=EVAL_RESULTS_PATH_BACKEND,
-         results_repo=RESULTS_REPO,
-         batch_size='auto',
-         device=DEVICE,
-         limit=LIMIT
-     )
-
-
- if __name__ == "__main__":
-     run_auto_eval()
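
The deleted backend above mixes plain print() calls with the logger configured through setup_logger. For illustration only (a sketch, not code from this repository), the ad-hoc prints could have been routed through that same logger:

# Sketch: replace the print() calls inside run_auto_eval() with the module logger
logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
logger.info("Selected eval request:")
logger.info(pp.pformat(eval_request))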
 
main_backend_lighteval.py DELETED
@@ -1,91 +0,0 @@
- import logging
- import pprint
-
- from huggingface_hub import snapshot_download
-
- logging.getLogger("openai").setLevel(logging.WARNING)
-
- from src.backend.run_eval_suite_lighteval import run_evaluation
- from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
- from src.backend.sort_queue import sort_models_by_priority
-
- from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION, TASKS_LIGHTEVAL
- from src.logging import setup_logger
-
- logger = setup_logger(__name__)
-
- # logging.basicConfig(level=logging.ERROR)
- pp = pprint.PrettyPrinter(width=80)
-
- PENDING_STATUS = "PENDING"
- RUNNING_STATUS = "RUNNING"
- FINISHED_STATUS = "FINISHED"
- FAILED_STATUS = "FAILED"
-
- snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
- def run_auto_eval():
-     current_pending_status = [PENDING_STATUS]
-
-     # pull the eval dataset from the hub and parse any eval requests
-     # check completed evals and set them to finished
-     check_completed_evals(
-         api=API,
-         checked_status=RUNNING_STATUS,
-         completed_status=FINISHED_STATUS,
-         failed_status=FAILED_STATUS,
-         hf_repo=QUEUE_REPO,
-         local_dir=EVAL_REQUESTS_PATH_BACKEND,
-         hf_repo_results=RESULTS_REPO,
-         local_dir_results=EVAL_RESULTS_PATH_BACKEND
-     )
-
-     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-     eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-     # Sort the evals by priority (first submitted first run)
-     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-     logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-     if len(eval_requests) == 0:
-         return
-
-     eval_request = eval_requests[0]
-     logger.info(pp.pformat(eval_request))
-
-
-     set_eval_request(
-         api=API,
-         eval_request=eval_request,
-         set_to_status=RUNNING_STATUS,
-         hf_repo=QUEUE_REPO,
-         local_dir=EVAL_REQUESTS_PATH_BACKEND,
-     )
-
-     # This needs to be done
-     #instance_size, instance_type = get_instance_for_model(eval_request)
-     # For GPU
-     # instance_size, instance_type = "small", "g4dn.xlarge"
-     # For CPU
-     instance_size, instance_type = "medium", "c6i"
-     logger.info(f'Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-     run_evaluation(
-         eval_request=eval_request,
-         task_names=TASKS_LIGHTEVAL,
-         local_dir=EVAL_RESULTS_PATH_BACKEND,
-         batch_size=1,
-         accelerator=ACCELERATOR,
-         region=REGION,
-         vendor=VENDOR,
-         instance_size=instance_size,
-         instance_type=instance_type,
-         limit=LIMIT
-     )
-
-     logger.info(f'Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-
- if __name__ == "__main__":
-     run_auto_eval()
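
The deleted lighteval backend above hard-codes a CPU Inference Endpoints instance ("medium", "c6i") and leaves get_instance_for_model() as a commented-out TODO. A hypothetical sketch of such a selector, using only the instance choices already named in the deleted comments (the size threshold and the params attribute are assumptions, not project code):

def get_instance_for_model(eval_request):
    # Hypothetical: route large models to the GPU option from the deleted comments,
    # everything else to the CPU default actually used above.
    params_b = getattr(eval_request, "params", None)  # assumed attribute, in billions of parameters
    if params_b and params_b > 7:
        return "small", "g4dn.xlarge"  # GPU
    return "medium", "c6i"  # CPU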
 
src/backend/run_eval_suite_harness.py DELETED
@@ -1,118 +0,0 @@
- import json
- import os
- import logging
- from datetime import datetime
-
- from lm_eval import tasks, evaluator, utils
- from lm_eval.tasks import TaskManager
-
- from src.envs import RESULTS_REPO, API
- from src.backend.manage_requests import EvalRequest
- from src.logging import setup_logger
-
- import numpy as np
-
- logging.getLogger("openai").setLevel(logging.WARNING)
- logger = setup_logger(__name__)
-
- class NumpyArrayEncoder(json.JSONEncoder):
-     def default(self, obj):
-         if isinstance(obj, np.ndarray):
-             return obj.tolist()
-         elif isinstance(obj, np.integer):
-             return int(obj)
-         elif isinstance(obj, np.floating):
-             return float(obj)
-         else:
-             return super().default(obj)
-
- def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int or str, device: str, local_dir: str, results_repo: str, limit: int =None):
-     """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
-
-     Args:
-         eval_request (EvalRequest): Input evaluation request file representation
-         task_names (list): Tasks to launch
-         num_fewshot (int): Number of few shots to use
-         batch_size (int): Selected batch size
-         device (str): "cpu" or "gpu:0", depending on what you assigned to the space
-         local_dir (str): Where to save the results locally
-         results_repo (str): To which repository to upload the results
-         no_cache (bool, optional): Whether to use a cache or not.
-         limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
-
-     Returns:
-         _type_: _description_
-     """
-     if limit:
-         logger.info(
-             "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-         )
-
-     task_manager = TaskManager()
-     all_tasks = task_manager.all_tasks
-
-     task_names = utils.pattern_match(task_names, all_tasks)
-
-     logger.info(f"Selected Tasks: {task_names}")
-
-     print("Here are the deets we are putting in:")
-     print("model args:")
-     model_args = eval_request.get_model_args()
-     print(model_args)
-     print("Tasks")
-     print(task_names)
-     print("Num fewshot")
-     print(num_fewshot)
-     print("Batch size")
-     print(batch_size)
-     print("Device")
-     print(device)
-     print("Limit")
-     print(limit)
-     # no_cache=no_cache,
-     # output_base_path="logs"
-     #gen_kwargs="max_new_tokens=None"
-     results = evaluator.simple_evaluate(
-         model="hf", #= "hf-causal-experimental", # "hf-causal"
-         model_args=model_args,
-         tasks=task_names,
-         num_fewshot=num_fewshot,
-         batch_size=batch_size,
-         device=device,
-         limit=limit,
-         write_out=True,
-         gen_kwargs="max_new_tokens=400"
-     )
-
-     # TODO: For Toxicity, substract from 100.
-     # results["realtoxicityprompts"]["score"] = 100 - results["realtoxicityprompts"]["score"]
-     results["config"]["model_dtype"] = eval_request.precision
-     results["config"]["model_name"] = eval_request.model
-     results["config"]["model_sha"] = eval_request.revision
-
-     print("Results is")
-     print(results)
-
-     dumped = json.dumps(results, cls=NumpyArrayEncoder, indent=2)
-     logger.info(dumped)
-
-     output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-     os.makedirs(os.path.dirname(output_path), exist_ok=True)
-     with open(output_path, "w") as f:
-         f.write(dumped)
-
-     logger.info(utils.make_table(results))
-
-     print("Uploading to")
-     print(output_path)
-     print("repo id")
-     print(results_repo)
-
-     API.upload_file(
-         path_or_fileobj=output_path,
-         path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-         repo_id=results_repo,
-         repo_type="dataset",
-     )
-
-     return results
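
The deleted run_eval_suite_harness.py serializes lm-eval results with a custom NumpyArrayEncoder because the raw results dictionary can contain numpy arrays and scalars that the standard json module refuses to encode. A small self-contained sketch of that pattern (illustration only, mirroring the deleted class):

import json
import numpy as np

class NumpyArrayEncoder(json.JSONEncoder):
    # Convert numpy containers and scalars into plain Python types for JSON output.
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, (np.integer, np.floating)):
            return obj.item()
        return super().default(obj)

print(json.dumps({"count": np.int64(3), "scores": np.array([0.1, 0.9])}, cls=NumpyArrayEncoder))
# -> {"count": 3, "scores": [0.1, 0.9]}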
 
src/backend/run_eval_suite_lighteval.py DELETED
@@ -1,88 +0,0 @@
- import json
- import argparse
- import logging
- from datetime import datetime
-
- from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
-
- from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
- from src.backend.manage_requests import EvalRequest
- from src.logging import setup_logger
-
- logging.getLogger("openai").setLevel(logging.WARNING)
- logger = setup_logger(__name__)
-
- def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
-     """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
-
-     Args:
-         eval_request (EvalRequest): Input evaluation request file representation
-         task_names (list): Tasks to launch
-         batch_size (int): Selected batch size
-         accelerator (str): Inference endpoint parameter for running the evaluation
-         region (str): Inference endpoint parameter for running the evaluation
-         vendor (str): Inference endpoint parameter for running the evaluation
-         instance_size (str): Inference endpoint parameter for running the evaluation
-         instance_type (str): Inference endpoint parameter for running the evaluation
-         local_dir (str): Where to save the results locally
-         no_cache (bool, optional): Whether to use a cache or not.
-         limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
-     """
-
-     if limit:
-         logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
-
-     args_dict = {
-         # Endpoint parameters
-         "endpoint_model_name": eval_request.model,
-         "accelerator": accelerator,
-         "vendor": vendor,
-         "region": region,
-         "instance_size": instance_size,
-         "instance_type": instance_type,
-         "reuse_existing": False,
-         "model_dtype": eval_request.precision,
-         "revision": eval_request.revision,
-         # Save parameters
-         "push_results_to_hub": True,
-         "save_details": True,
-         "push_details_to_hub": True,
-         "public_run": False,
-         "cache_dir": CACHE_PATH,
-         "results_org": RESULTS_REPO,
-         "output_dir": local_dir,
-         "job_id": str(datetime.now()),
-         # Experiment parameters
-         "override_batch_size": batch_size,
-         "custom_tasks": "custom_tasks.py",
-         "tasks": task_names,
-         "max_samples": limit,
-         "use_chat_template": False,
-         "system_prompt": None,
-         # Parameters which would be set to things by the kwargs if actually using argparse
-         "inference_server_address": None,
-         "model_args": None,
-         "num_fewshot_seeds": None,
-         "delta_weights": False,
-         "adapter_weights": False
-     }
-     args = argparse.Namespace(**args_dict)
-
-     try:
-         results = main(args)
-
-         results["config"]["model_dtype"] = eval_request.precision
-         results["config"]["model_name"] = eval_request.model
-         results["config"]["model_sha"] = eval_request.revision
-
-         dumped = json.dumps(results, indent=2)
-         logger.info(dumped)
-     except Exception as e:  # if eval failed, we force a cleanup
-         env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-         model_config = create_model_config(args=args, accelerator=accelerator)
-         model, _ = load_model(config=model_config, env_config=env_config)
-         model.cleanup()
-
-
-     return results
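
One detail worth noting in the deleted run_evaluation() above: the except branch cleans up the inference endpoint but neither re-raises nor assigns results, so a failed run would end in an UnboundLocalError at return results. A sketch of a tighter failure path, reusing only the names imported in the deleted file (an editor's sketch, not the project's actual fix):

try:
    results = main(args)
except Exception:
    # Force endpoint cleanup, then surface the original error to the caller
    env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
    model_config = create_model_config(args=args, accelerator=accelerator)
    model, _ = load_model(config=model_config, env_config=env_config)
    model.cleanup()
    raise
return results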
 
src/backend/run_toxicity_eval.py CHANGED
@@ -1,6 +1,5 @@
  import json
  import os
- import logging
  import time
  from datetime import datetime
  import sys
@@ -18,7 +17,6 @@ from statistics import mean
  from src.envs import TOKEN, PERSPECTIVE_API_KEY, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API
  # QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, CACHE_PATH,
  from src.logging import setup_logger
- logging.basicConfig(level=logging.INFO)
  logger = setup_logger(__name__)

  #HF_MODEL_URL = "https://adwo3r8t8ii7itck.us-east-1.aws.endpoints.huggingface.cloud"
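
With this change, run_toxicity_eval.py no longer calls logging.basicConfig(level=logging.INFO) and drops the now-unused import logging; log configuration is left entirely to src.logging's setup_logger. That helper is not shown in this commit, so the following is a hypothetical sketch of what such a function typically does (handler choices and format string are assumptions):

import logging

def setup_logger(name: str) -> logging.Logger:
    # Hypothetical sketch of the setup_logger imported above; the real
    # implementation lives in src/logging.py and is not part of this diff.
    logger = logging.getLogger(name)
    if not logger.handlers:  # avoid adding duplicate handlers on repeated imports
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)  # fills the role of the removed basicConfig(level=INFO)
    return logger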