meg-huggingface committed
Commit 4f55f5f
1 Parent(s): 798ff9d

Deleting logging that we're not using

Files changed:
- custom_tasks.py +0 -90
- main_backend_harness.py +0 -80
- main_backend_lighteval.py +0 -91
- src/backend/run_eval_suite_harness.py +0 -118
- src/backend/run_eval_suite_lighteval.py +0 -88
- src/backend/run_toxicity_eval.py +0 -2
custom_tasks.py
DELETED
@@ -1,90 +0,0 @@
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval. Complete this task with your own configuration if you want to use a custom lighteval task.
-
-This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
-
-Author:
-"""
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-
-## EVAL WITH NO SUBSET ##
-# This is how you create a simple tasks (like hellaswag) which has one single subset
-# attached to it, and one evaluation possible.
-task = LightevalTaskConfig(
-    name="myothertask",
-    prompt_function="prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
-    suite=["community"],
-    hf_repo="",
-    hf_subset="default",
-    hf_avail_splits=[],
-    evaluation_splits=[],
-    few_shots_split="",
-    few_shots_select="",
-    metric=[""],
-)
-
-## EVALS WITH SUBSET
-# This is how you create a subset task (like MMLU), which has several subset
-# each being its own evaluation task.
-
-# fmt: off
-SAMPLE_SUBSETS = []  # list of all the subsets to use for this eval
-# fmt: on
-
-
-class CustomSubsetTask(LightevalTaskConfig):
-    def __init__(
-        self,
-        name,
-        hf_subset,
-    ):
-        super().__init__(
-            name=name,
-            hf_subset=hf_subset,
-            prompt_function="prompt_fn",  # must be defined in the file
-            hf_repo="",
-            metric=[""],
-            hf_avail_splits=[],
-            evaluation_splits=[],
-            few_shots_split="",
-            few_shots_select="",
-            suite=["community"],
-            generation_size=-1,
-            stop_sequence=None,
-            output_regex=None,
-            frozen=False,
-        )
-
-
-## DEFINE YOUR PROMPT FUNCTIONS
-# Define as many as you need for your different tasks
-def prompt_fn(line, task_name: str = None):
-    """Defines how to go from a dataset line to a doc object.
-    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
-    about what this function should do in the README.
-    """
-    return Doc(
-        task_name=task_name,
-        query="",
-        choices="",
-        gold_index=0,
-        instruction="",
-    )
-
-
-## STORE YOUR EVALS
-SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
-_TASKS = SUBSET_TASKS + [task]
-
-## MODULE LOGIC
-# You should not need to touch this
-# Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in _TASKS]
-
-if __name__ == "__main__":
-    print(t["name"] for t in TASKS_TABLE)
-    print(len(TASKS_TABLE))
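Note: the deleted template's __main__ block passed a generator expression straight to print, so it displayed a generator object rather than the task names. A minimal corrected sketch of that module logic, assuming the same TASKS_TABLE list-of-dicts shape, would be:

# Minimal sketch: print each registered task name, then the total count.
# The placeholder entry below stands in for the dicts produced by task.as_dict().
TASKS_TABLE = [{"name": "myothertask"}]  # placeholder entry for illustration

if __name__ == "__main__":
    print([t["name"] for t in TASKS_TABLE])
    print(len(TASKS_TABLE))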
main_backend_harness.py
DELETED
@@ -1,80 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_harness import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
-from src.envs import TASKS_HARNESS, NUM_FEWSHOT
-from src.logging import setup_logger
-
-# logging.basicConfig(level=logging.ERROR)
-logger = setup_logger(__name__)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    print("eval request is")
-    print(eval_request)
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        results_repo=RESULTS_REPO,
-        batch_size='auto',
-        device=DEVICE,
-        limit=LIMIT
-    )
-
-
-if __name__ == "__main__":
-    run_auto_eval()
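For reference, both deleted backend entry points consume an EvalRequest object from src.backend.manage_requests, which is not part of this commit. A hedged, self-contained stub covering only the fields and the one method the deleted scripts actually touched might look like this (field names and the model_args format are inferred from the call sites, not from the real class):

from dataclasses import dataclass

@dataclass
class EvalRequestStub:
    """Illustrative stand-in for src.backend.manage_requests.EvalRequest."""
    model: str
    revision: str = "main"
    precision: str = "float16"
    json_filepath: str = ""

    def get_model_args(self) -> str:
        # Assumed lm-eval-harness style comma-separated model arguments.
        return f"pretrained={self.model},revision={self.revision},dtype={self.precision}"

# Usage sketch
request = EvalRequestStub(model="org/model-name")
print(request.get_model_args())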
main_backend_lighteval.py
DELETED
@@ -1,91 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION, TASKS_LIGHTEVAL
-from src.logging import setup_logger
-
-logger = setup_logger(__name__)
-
-# logging.basicConfig(level=logging.ERROR)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
-    # For GPU
-    # instance_size, instance_type = "small", "g4dn.xlarge"
-    # For CPU
-    instance_size, instance_type = "medium", "c6i"
-    logger.info(f'Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_LIGHTEVAL,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        batch_size=1,
-        accelerator=ACCELERATOR,
-        region=REGION,
-        vendor=VENDOR,
-        instance_size=instance_size,
-        instance_type=instance_type,
-        limit=LIMIT
-    )
-
-    logger.info(f'Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-
-if __name__ == "__main__":
-    run_auto_eval()
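The deleted lighteval backend hard-coded the Inference Endpoint sizing and kept the GPU option as a comment. A small sketch of that same CPU/GPU switch, using only the example values already present in the deleted script, is:

# Illustrative only: the values below are the examples from the deleted script,
# not a sizing recommendation.
use_gpu = False

if use_gpu:
    instance_size, instance_type = "small", "g4dn.xlarge"
else:
    instance_size, instance_type = "medium", "c6i"

print(f"Requesting Inference Endpoint: {instance_size} {instance_type}")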
src/backend/run_eval_suite_harness.py
DELETED
@@ -1,118 +0,0 @@
-import json
-import os
-import logging
-from datetime import datetime
-
-from lm_eval import tasks, evaluator, utils
-from lm_eval.tasks import TaskManager
-
-from src.envs import RESULTS_REPO, API
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-
-import numpy as np
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-class NumpyArrayEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        elif isinstance(obj, np.integer):
-            return int(obj)
-        elif isinstance(obj, np.floating):
-            return float(obj)
-        else:
-            return super().default(obj)
-
-def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int or str, device: str, local_dir: str, results_repo: str, limit: int =None):
-    """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
-
-    Args:
-        eval_request (EvalRequest): Input evaluation request file representation
-        task_names (list): Tasks to launch
-        num_fewshot (int): Number of few shots to use
-        batch_size (int): Selected batch size
-        device (str): "cpu" or "gpu:0", depending on what you assigned to the space
-        local_dir (str): Where to save the results locally
-        results_repo (str): To which repository to upload the results
-        no_cache (bool, optional): Whether to use a cache or not.
-        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
-
-    Returns:
-        _type_: _description_
-    """
-    if limit:
-        logger.info(
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
-
-    task_manager = TaskManager()
-    all_tasks = task_manager.all_tasks
-
-    task_names = utils.pattern_match(task_names, all_tasks)
-
-    logger.info(f"Selected Tasks: {task_names}")
-
-    print("Here are the deets we are putting in:")
-    print("model args:")
-    model_args = eval_request.get_model_args()
-    print(model_args)
-    print("Tasks")
-    print(task_names)
-    print("Num fewshot")
-    print(num_fewshot)
-    print("Batch size")
-    print(batch_size)
-    print("Device")
-    print(device)
-    print("Limit")
-    print(limit)
-    # no_cache=no_cache,
-    # output_base_path="logs"
-    #gen_kwargs="max_new_tokens=None"
-    results = evaluator.simple_evaluate(
-        model="hf", #= "hf-causal-experimental", # "hf-causal"
-        model_args=model_args,
-        tasks=task_names,
-        num_fewshot=num_fewshot,
-        batch_size=batch_size,
-        device=device,
-        limit=limit,
-        write_out=True,
-        gen_kwargs="max_new_tokens=400"
-    )
-
-    # TODO: For Toxicity, substract from 100.
-    # results["realtoxicityprompts"]["score"] = 100 - results["realtoxicityprompts"]["score"]
-    results["config"]["model_dtype"] = eval_request.precision
-    results["config"]["model_name"] = eval_request.model
-    results["config"]["model_sha"] = eval_request.revision
-
-    print("Results is")
-    print(results)
-
-    dumped = json.dumps(results, cls=NumpyArrayEncoder, indent=2)
-    logger.info(dumped)
-
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w") as f:
-        f.write(dumped)
-
-    logger.info(utils.make_table(results))
-
-    print("Uploading to")
-    print(output_path)
-    print("repo id")
-    print(results_repo)
-
-    API.upload_file(
-        path_or_fileobj=output_path,
-        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-        repo_id=results_repo,
-        repo_type="dataset",
-    )
-
-    return results
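The NumpyArrayEncoder defined in the deleted module exists because lm-eval-harness results can contain numpy arrays and scalars that the standard json module refuses to serialize. A self-contained usage sketch, with a made-up results dict standing in for real harness output, is:

import json
import numpy as np

class NumpyArrayEncoder(json.JSONEncoder):
    """Convert numpy arrays and scalars to plain Python types for json.dumps."""
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

# Made-up results for illustration; real harness output is much larger.
results = {"acc": np.float64(0.8125), "per_sample": np.array([1, 0, 1, 1])}
print(json.dumps(results, cls=NumpyArrayEncoder, indent=2))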
src/backend/run_eval_suite_lighteval.py
DELETED
@@ -1,88 +0,0 @@
-import json
-import argparse
-import logging
-from datetime import datetime
-
-from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
-
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
-    """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
-
-    Args:
-        eval_request (EvalRequest): Input evaluation request file representation
-        task_names (list): Tasks to launch
-        batch_size (int): Selected batch size
-        accelerator (str): Inference endpoint parameter for running the evaluation
-        region (str): Inference endpoint parameter for running the evaluation
-        vendor (str): Inference endpoint parameter for running the evaluation
-        instance_size (str): Inference endpoint parameter for running the evaluation
-        instance_type (str): Inference endpoint parameter for running the evaluation
-        local_dir (str): Where to save the results locally
-        no_cache (bool, optional): Whether to use a cache or not.
-        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
-    """
-
-    if limit:
-        logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
-
-    args_dict = {
-        # Endpoint parameters
-        "endpoint_model_name":eval_request.model,
-        "accelerator": accelerator,
-        "vendor": vendor,
-        "region": region,
-        "instance_size": instance_size,
-        "instance_type": instance_type,
-        "reuse_existing": False,
-        "model_dtype": eval_request.precision,
-        "revision": eval_request.revision,
-        # Save parameters
-        "push_results_to_hub": True,
-        "save_details": True,
-        "push_details_to_hub": True,
-        "public_run": False,
-        "cache_dir": CACHE_PATH,
-        "results_org": RESULTS_REPO,
-        "output_dir": local_dir,
-        "job_id": str(datetime.now()),
-        # Experiment parameters
-        "override_batch_size": batch_size,
-        "custom_tasks": "custom_tasks.py",
-        "tasks": task_names,
-        "max_samples": limit,
-        "use_chat_template": False,
-        "system_prompt": None,
-        # Parameters which would be set to things by the kwargs if actually using argparse
-        "inference_server_address": None,
-        "model_args": None,
-        "num_fewshot_seeds": None,
-        "delta_weights": False,
-        "adapter_weights": False
-    }
-    args = argparse.Namespace(**args_dict)
-
-    try:
-        results = main(args)
-
-        results["config"]["model_dtype"] = eval_request.precision
-        results["config"]["model_name"] = eval_request.model
-        results["config"]["model_sha"] = eval_request.revision
-
-        dumped = json.dumps(results, indent=2)
-        logger.info(dumped)
-    except Exception as e: # if eval failed, we force a cleanup
-        env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-        model_config = create_model_config(args=args, accelerator=accelerator)
-        model, _ = load_model(config=model_config, env_config=env_config)
-        model.cleanup()
-
-
-    return results
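The deleted lighteval runner bypassed the CLI by packing what would normally be command-line options into a dict and unpacking it into argparse.Namespace before calling lighteval's main. A minimal self-contained sketch of that pattern follows; the keys shown are an illustrative subset, not lighteval's full argument set, and the task spec string is hypothetical:

import argparse

# Illustrative subset of the options built in the deleted run_evaluation().
args_dict = {
    "tasks": "community|mytask|0|0",   # hypothetical task spec string
    "custom_tasks": "custom_tasks.py",
    "override_batch_size": 1,
    "max_samples": None,               # None means no sample limit
    "use_chat_template": False,
}
args = argparse.Namespace(**args_dict)

print(args.tasks, args.override_batch_size)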
src/backend/run_toxicity_eval.py
CHANGED
@@ -1,6 +1,5 @@
 import json
 import os
-import logging
 import time
 from datetime import datetime
 import sys
@@ -18,7 +17,6 @@ from statistics import mean
 from src.envs import TOKEN, PERSPECTIVE_API_KEY, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API
 # QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, CACHE_PATH,
 from src.logging import setup_logger
-logging.basicConfig(level=logging.INFO)
 logger = setup_logger(__name__)
 
 #HF_MODEL_URL = "https://adwo3r8t8ii7itck.us-east-1.aws.endpoints.huggingface.cloud"
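This last hunk drops the module-level logging.basicConfig call and the now-unused logging import, leaving logger configuration entirely to setup_logger from src.logging. That helper is not part of this diff; a typical minimal implementation consistent with how it is called (a hedged guess, not the repository's actual code) would be:

import logging

def setup_logger(name: str) -> logging.Logger:
    """Hypothetical sketch of src.logging.setup_logger, inferred from its call sites."""
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
    return logger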