In [None]:
%%bash
# Stop at the first failing command: without this, a failed clone/cd would let
# `pip install -e .` run against whatever happens to be in the current directory.
set -euo pipefail

# Clone only when the checkout is missing so the cell can be re-run safely.
if [ ! -d llmperf/.git ]; then
    git clone https://github.com/philschmid/llmperf.git
fi
cd llmperf
pip install -e . -q

# Setup

In [None]:
import sys
import json
from getpass import getpass
import subprocess
import os
from datetime import datetime
import pandas as pd
import numpy as np
from huggingface_hub import notebook_login, create_inference_endpoint, list_inference_endpoints, whoami, get_inference_endpoint, get_token
from pathlib import Path
from tqdm.notebook import tqdm

In [None]:
# Log in to the Hugging Face Hub from inside the notebook. Later cells call
# get_token() to hand the credential to the endpoint and to the benchmark process.
notebook_login()

In [None]:
# All paths below are resolved relative to the kernel's current working directory.
proj_dir = Path.cwd()
# Location of the llmperf checkout (the setup cell clones it into ./llmperf).
LLMPerf_path = proj_dir / 'llmperf'
print(proj_dir)

# Config

In [None]:
# --- Endpoint -----------------------------------------------------------------
ENDPOINT_NAME = "mixtral-exp"                # prefix of the per-run endpoint name
NAMESPACE = "HF-test-lab"                    # namespace the endpoint is looked up / created in
MODEL = "TheBloke/mixtral-8x7b-v0.1-GPTQ"    # repository served by the endpoint and benchmarked
# Used as a results-directory label and passed to create_endpoint().
# NOTE(review): this is a label, not a hardware id - create_endpoint() hard-codes 'nvidia-l4'.
# NOTE(review): MODEL is a GPTQ repo while the endpoint sets QUANTIZE=awq - confirm they match.
INSTANCE_TYPE = "nvidia-l4_AWQ"

# --- Simulation ---------------------------------------------------------------
RESULTS_DIR = proj_dir / "tgi_benchmark_results" / INSTANCE_TYPE
tgi_bss = [1]            # one endpoint is benchmarked per value (passed as MAX_BATCH_SIZE)
INPUT_TOKENS = 800       # mean prompt length requested from the benchmark
OUTPUT_TOKENS = 1600     # mean completion length requested from the benchmark

# Endpoint setup

Be sure to configure your endpoint the way you want it; I made some guesses about what you might want in the `env`. You can see some of the available settings in the [pricing section](https://huggingface.co/docs/inference-endpoints/en/pricing#gpu-instances) of the docs. I would also recommend manually deploying once and using `get_inference_endpoint().__dict__` to double-check your settings.

In [None]:
def create_endpoint(MAX_BATCH_SIZE, name, instance_type):
    """Return a ready inference endpoint called ``name``, creating it if needed.

    An endpoint with that name in ``NAMESPACE`` is reused when the lookup
    succeeds; otherwise a new text-generation-inference endpoint serving
    ``MODEL`` is created. In both cases the function blocks on
    ``endpoint.wait()`` before returning.

    Args:
        MAX_BATCH_SIZE: Value forwarded to the container as the
            ``MAX_BATCH_SIZE`` environment variable.
        name: Endpoint name to look up or create.
        instance_type: Accepted for call compatibility but NOT used; the
            hardware is fixed to ``nvidia-l4`` / ``x4`` below.
            NOTE(review): the notebook passes ``INSTANCE_TYPE``, which doubles
            as a results-directory label, so it cannot be forwarded as-is.

    Returns:
        The endpoint object, or ``None`` when creation or start-up failed
        (the error is printed).
    """
    # Look up an existing endpoint first. Only Exception is caught so that
    # KeyboardInterrupt / SystemExit still stop the notebook.
    try:
        endpoint = get_inference_endpoint(name=name, namespace=NAMESPACE)
    except Exception:
        endpoint = None  # presumably "not found"; fall through to creation

    if endpoint is None:
        try:
            endpoint = create_inference_endpoint(
                name,
                repository=MODEL,
                task="text-generation",
                framework="pytorch",
                region="us-east-1",
                vendor="aws",
                accelerator="gpu",
                instance_size="x4",
                instance_type='nvidia-l4',
                min_replica=0,
                max_replica=1,
                namespace=NAMESPACE,
                custom_image={
                    "health_route": "/health",
                    "env": {
                        "MAX_INPUT_LENGTH": f"{INPUT_TOKENS+50}",
                        # NOTE(review): no headroom above mean input + mean output;
                        # requests sampled above the means may exceed this - confirm.
                        "MAX_TOTAL_TOKENS": f"{INPUT_TOKENS + OUTPUT_TOKENS}",
                        "MAX_BATCH_SIZE": f"{MAX_BATCH_SIZE}",
                        "HF_TOKEN": get_token(),
                        # NOTE(review): confirm this matches the quantization of MODEL.
                        "QUANTIZE": "awq",
                        "MODEL_ID": "/repository",
                    },
                    "url": "ghcr.io/huggingface/text-generation-inference:2.2.0",
                },
                type="protected",
            )
        except Exception as create_error:
            print(f"Failed to create inference endpoint: {str(create_error)}")
            return None

    # Wait outside the lookup/creation handlers: a start-up failure on an
    # existing endpoint must not trigger an attempt to create a duplicate.
    try:
        endpoint.wait()
    except Exception as wait_error:
        print(f"Inference endpoint {name} did not become ready: {str(wait_error)}")
        return None

    return endpoint

Check that the command matches what you expect. Also check the summary stats JSON to see what actually happened.

In [None]:
def run_command(batch_size, endpoint, tgi_bs):
    """Run one LLMPerf benchmark against ``endpoint`` in a subprocess.

    Args:
        batch_size: Number of concurrent client requests. The run completes
            ``8 * batch_size`` requests, capped at 1500.
        endpoint: Endpoint object; only its ``url`` attribute is read.
        tgi_bs: Server-side batch size, used only to label the results directory.

    Returns:
        A ``(output, success)`` tuple. ``output`` is the combined
        stdout/stderr of the benchmark (or a short reason when it could not
        be started); ``success`` is ``False`` when the script is missing, no
        token is available, or the process exits with a non-zero status.
    """
    prefix = f'tgibs_{tgi_bs}__bs_{batch_size}'
    vu = batch_size

    # Locate the benchmark script before doing any other work
    benchmark_script = str(LLMPerf_path / "token_benchmark_ray.py")
    if not os.path.isfile(benchmark_script):
        print(f"LLMPerf script not found at {benchmark_script}, please ensure the path is correct.")
        return "Script not found", False

    # A missing token would otherwise surface as a TypeError inside subprocess,
    # because environment values must be strings.
    token = get_token()
    if token is None:
        print("No Hugging Face token found, please log in first.")
        return "Token not found", False

    # Set environment variables
    env = os.environ.copy()
    env['HUGGINGFACE_API_BASE'] = endpoint.url
    env['HUGGINGFACE_API_TOKEN'] = token
    env['MODEL_ID'] = MODEL
    # Prepend the checkout to PYTHONPATH. Empty parts are dropped so an unset
    # PYTHONPATH does not leave a trailing separator (which Python reads as
    # "current directory").
    env['PYTHONPATH'] = os.pathsep.join(
        part for part in (str(LLMPerf_path), env.get('PYTHONPATH', '')) if part
    )

    # Calculate the max number of completed requests
    max_requests = vu * 8

    # Generate the results directory name
    date_str = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    results_dir = RESULTS_DIR / f"{date_str}_{prefix}"

    # Construct the command to run the benchmark script. sys.executable keeps
    # the subprocess on the kernel's interpreter rather than whichever
    # "python" happens to be first on PATH.
    command = [
        sys.executable, benchmark_script,
        "--model", f"{MODEL}",
        "--mean-input-tokens", f"{INPUT_TOKENS}",
        "--stddev-input-tokens", "10",
        "--mean-output-tokens", f"{OUTPUT_TOKENS}",
        "--stddev-output-tokens", "5",
        "--max-num-completed-requests", str(min(max_requests, 1500)),
        "--timeout", "7200",
        "--num-concurrent-requests", str(vu),
        "--results-dir", str(results_dir),
        "--llm-api", "huggingface",
        "--additional-sampling-params", '{}'
    ]

    # Run the command with the modified environment
    try:
        result = subprocess.check_output(command, stderr=subprocess.STDOUT, env=env).decode('utf-8')
        return result, True
    except subprocess.CalledProcessError as e:
        failure_output = e.output.decode()
        print(f"Error with batch size {batch_size}: {failure_output}")
        return failure_output, False

def find_max_working_batch_size(endpoint, tgi_bs):
    """Probe increasing client concurrency and report the largest that succeeds.

    Runs the benchmark at 8, 16 and 32 concurrent requests, in that order,
    and stops at the first failing run.

    Args:
        endpoint: Endpoint object handed through to ``run_command``.
        tgi_bs: Server-side batch size, handed through to ``run_command`` and
            shown in the progress message.

    Returns:
        The last concurrency level that succeeded, or an explanatory string
        when even the first level failed.
    """
    candidates = [8, 16, 32]
    largest_ok = None
    for n_clients in tqdm(candidates):
        tqdm.write(f"Running: TGIBS {tgi_bs} Client Requests {n_clients}")
        _, ok = run_command(n_clients, endpoint, tgi_bs)
        if not ok:
            # Levels are tried in ascending order; stop at the first failure.
            break
        largest_ok = n_clients
    return "No working batch size found in the provided list" if largest_ok is None else largest_ok

Here I'm creating the endpoint and then running the simulation.

In [None]:
# One endpoint per server-side batch size: reuse or create it, benchmark it,
# and always delete it afterwards.
for tgi_bs in tqdm(tgi_bss):
    name = f"{ENDPOINT_NAME}--tgibs-{tgi_bs}"
    try:
        endpoint = get_inference_endpoint(name, namespace=NAMESPACE)
    except Exception:
        # Lookup failed (presumably the endpoint does not exist yet): create it.
        endpoint = create_endpoint(MAX_BATCH_SIZE=tgi_bs, name=name, instance_type=INSTANCE_TYPE)

    # create_endpoint() returns None on failure; skip this configuration
    # instead of crashing on endpoint.wait().
    if endpoint is None:
        tqdm.write(f"Skipping TGIBS {tgi_bs}: endpoint {name} is not available")
        continue

    try:
        endpoint.wait()
        tqdm.write(f"Endpoint Created: {name}")
        max_batch_size = find_max_working_batch_size(endpoint=endpoint, tgi_bs=tgi_bs)
        tqdm.write(f"Max working client batch size for TGIBS {tgi_bs}: {max_batch_size}")
    finally:
        # Delete even when the benchmark raises or is interrupted, so the
        # endpoint is not left running.
        endpoint.delete()
        tqdm.write(f"Endpoint Deleted: {name}")