Added ilfacts task (not for eval yet) and updated backend with fallback
Files changed:
- custom_tasks.py (+2 -1)
- src/backend/run_eval_suite_lighteval.py (+75 -62)
- src/custom_tasks/ilfacts_task.py (+56 -0)
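The "fallback" in the commit title is the backend change in src/backend/run_eval_suite_lighteval.py below: the evaluation is attempted once per text-generation-inference image version, endpoint timeouts are retried with reuse_existing, and a failed run force-deletes the endpoint. A minimal, hypothetical sketch of that control flow (EndpointTimeout, run_one_eval and delete_endpoint are stand-ins for the real InferenceEndpointTimeoutError, main() and HfApi.delete_inference_endpoint calls used in the diff):

# Hypothetical, simplified sketch of the fallback flow added in this commit.
class EndpointTimeout(Exception):
    pass

def run_one_eval(img_version: str, reuse_existing: bool):
    # stand-in: pretend the first image version always times out
    if img_version == '1.4.5':
        raise EndpointTimeout
    return {"image": img_version, "reused": reuse_existing}

def delete_endpoint():
    print("cleaning up endpoint")

def run_with_fallback(image_versions=('1.4.5', '2.0.3'), max_retries=3):
    results = None
    for img_version in image_versions:              # outer fallback: one pass per TGI image
        completed = False
        try:
            for attempt in range(max_retries):      # inner retry loop for endpoint timeouts
                try:
                    results = run_one_eval(img_version, reuse_existing=attempt > 0)
                    completed = True
                    break
                except EndpointTimeout:
                    print('Timed out, trying again...')
        except Exception:
            delete_endpoint()                       # force cleanup if the eval itself fails
        if completed:
            break                                   # success: skip the remaining image versions
    return results

if __name__ == "__main__":
    print(run_with_fallback())                      # -> {'image': '2.0.3', 'reused': False}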
custom_tasks.py (CHANGED)

@@ -11,11 +11,12 @@ from src.custom_tasks.sentiment_task import *
 from src.custom_tasks.winograd_task import *
 from src.custom_tasks.translation_task import *
 from src.custom_tasks.snli_task import *
+from src.custom_tasks.ilfacts_task import *

 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, translation_task, snli_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, translation_task, snli_task, ilfacts_task]]

 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
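A quick, hypothetical sanity check (not part of the commit) that the new task actually lands in the table lighteval will read. It assumes it is run from the repo root and that as_dict() produces a mapping with a "name" key, as the existing __main__ block implies:

# Hypothetical check, run from the repo root: the ilfacts task should now be listed.
from custom_tasks import TASKS_TABLE

names = [t["name"] for t in TASKS_TABLE]
assert any("ilfacts" in name for name in names), "ilfacts task was not registered"
print(names)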
src/backend/run_eval_suite_lighteval.py (CHANGED)

@@ -3,6 +3,7 @@ import os
 import logging
 from datetime import datetime
 from argparse import Namespace
+import traceback

 from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
 from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN, OWNER

@@ -10,6 +11,7 @@ from src.backend.manage_requests import EvalRequest
 from lighteval.logging.evaluation_tracker import EnhancedJSONEncoder
 from lighteval.models.model_loader import ModelInfo
 from huggingface_hub.errors import InferenceEndpointTimeoutError
+from huggingface_hub import HfApi

 logging.getLogger("openai").setLevel(logging.WARNING)

@@ -21,70 +23,81 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
     if limit:
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

+    api = HfApi(token=TOKEN)
+
+    completed = False
+    for img_version in ['1.4.5', '2.0.3']:
+        args = DefaultNamespace(**{
+            "model_config": dict(model=dict(
+                type="endpoint",
+                base_params=dict(
+                    endpoint_name=f'{eval_request.model.split("/")[1].replace(".", "-").replace("_", "-").lower()}-lighteval'[-32:].strip('-'),
+                    model=eval_request.model,
+                    revision=eval_request.revision,
+                    dtype=eval_request.precision,
+                    reuse_existing=False
+                ),
+                instance=dict(
+                    accelerator=accelerator,
+                    region=region,
+                    vendor=vendor,
+                    instance_size=instance_size,
+                    instance_type=instance_type,
+                    framework='pytorch',
+                    endpoint_type='protected',
+                    namespace=OWNER,
+                    image_url='ghcr.io/huggingface/text-generation-inference:' + img_version
+                ),
+                generation=dict(
+                    add_special_tokens=True
+                )
+            )),
+            "max_samples": limit,
+            "job_id": str(datetime.now()),
+            "push_results_to_hub": True,
+            "save_details": False,
+            "push_details_to_hub": False,
+            "public_run": False,
+            "cache_dir": CACHE_PATH,
+            "results_org": OWNER,
+            "output_dir": local_dir,
+            "override_batch_size": batch_size,
+            "custom_tasks": "custom_tasks.py",
+            "tasks": task_names,
+            "dataset_loading_processes": 24,
+            "num_fewshot_seeds": 0
+        })
+
+        try:
+            # in case of timeout, try it again with reuse_existing
+            for i in range(3):
+                try:
+                    results = main(args)
+                    completed = True # success!

+                    dumped = json.dumps(results, cls=EnhancedJSONEncoder, indent=2)
+                    print(dumped)
+
+                    # if we are i>0, then raise an error so that we call clean up
+                    if i > 0: raise Exception()
+                    break # no need to loop twice if we completed
+                except InferenceEndpointTimeoutError:
+                    if i < 3:
+                        print('Timed out, trying again...')
+                        args.model_config['model']['base_params']['reuse_existing'] = True
+                    # loop around and try again, for timeout
+
+        except Exception as ex: # if eval failed, we force a cleanup
+            traceback.print_exception(ex)
             try:
-                args.model_config['model']['base_params']['reuse_existing'] = True
+                api.delete_inference_endpoint(
+                    name=args.model_config['model']['base_params']['endpoint_name'],
+                    namespace=args.model_config['model']['instance']['namespace']
+                )
+            except Exception as ex:
+                traceback.print_exception(ex)

-            print(dumped)
-        except Exception as ex: # if eval failed, we force a cleanup
-            import traceback
-            traceback.print_exception(ex)
-            env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-            args.model_config['model']['base_params']['reuse_existing'] = True
-            model_config = create_model_config(args=args, accelerator=accelerator)
-            model, _ = load_model(config=model_config, env_config=env_config)
-            print('Cleaning up')
-            model.reuse_existing = False # force it to clean up
-            model.cleanup()
-            results = None
+        if completed: break # no need to try with a different image version

     return results
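One detail worth noting in the config above is how the endpoint name is derived: the part of the model id after the slash is slugified, "-lighteval" is appended, and only the last 32 characters are kept (presumably to stay within the Inference Endpoints name-length limit). A hypothetical model id makes the behaviour concrete:

# Hypothetical model id; the expression mirrors the endpoint_name line in the diff above.
model = "SomeOrg/My_Model-v1.0"
endpoint_name = f'{model.split("/")[1].replace(".", "-").replace("_", "-").lower()}-lighteval'[-32:].strip('-')
print(endpoint_name)  # -> my-model-v1-0-lighteval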
src/custom_tasks/ilfacts_task.py (ADDED)

@@ -0,0 +1,56 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+
+def ilfacts_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    # do some santizations, since some models produce more info
+    pred = re.sub('<[^>]+>', '', predictions[0]).strip() # remove xml tags
+    return 1 if pred == golds[0] else 0
+
+ilfacts_acc_metric = CorpusLevelMetric(
+    metric="ilfacts_acc",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=ilfacts_eval_fn
+)
+extend_enum(Metrics, 'ilfacts_acc_metric', ilfacts_acc_metric)
+
+def ilfacts_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
+        gold_index=0,
+        instruction="",
+    )
+
+# This is how you create a simple tasks (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+ilfacts_task = LightevalTaskConfig(
+    name="ilfacts-acc",
+    prompt_function="ilfacts_prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["ilfacts"],
+    evaluation_splits=["ilfacts"],
+    metric=['ilfacts_acc_metric'],
+    stop_sequence=['\n'],
+    generation_size=48
+)
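A hypothetical spot check of the sample-level scorer (not part of the commit); the gold and prediction strings are invented, but they show that the tag stripping lets an answer wrapped in XML-style tags still count as an exact match. It assumes the backend dependencies (lighteval, aenum, numpy, Levenshtein) are installed and the repo root is on the import path:

from src.custom_tasks.ilfacts_task import ilfacts_eval_fn

# Invented examples: exact match after tag stripping scores 1, anything else scores 0.
print(ilfacts_eval_fn(golds=["נכון"], predictions=["<s>נכון</s> "]))  # -> 1
print(ilfacts_eval_fn(golds=["נכון"], predictions=["כנראה נכון"]))    # -> 0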