# NOTE(review): the original lines here ("Spaces:" / "Running") were page-scrape
# residue from a Hugging Face Spaces listing, not code; kept only as a comment.
import json | |
import time | |
from datasets import Dataset | |
from requests_futures.sessions import FuturesSession | |
from transformers import AutoTokenizer | |
from defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA, HEADERS, | |
MODEL_NAME) | |
# Report template for a single request.
# Placeholders: {0} HTTP status code, {1} model prediction,
# {2} server-side inference latency in ms, {3} peak GPU memory in MB,
# {4} client-measured end-to-end latency in ms.
RETURN_MESSAGE_SINGLE = """
Inference statistics:
* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""
# Report template for the asynchronous batch ("spam") benchmark.
# Placeholders: {0} throughput in samples/s, {1} mean inference latency in ms,
# {2} mean peak GPU memory in MB, {3} mean padding ratio in %,
# {4} mean sequence length in tokens, {5} effective mean batch size.
RETURN_MESSAGE_SPAM = """
Processing inputs sent asynchronously. Grab a coffee.
Inference statistics:
* Throughput: {0} samples/s
* Mean inference latency (preprocessing/forward/postprocessing): {1} ms
* Mean peak GPU memory: {2} MB
* Mean padding ratio: {3} %
* Mean sequence length: {4} tokens
* Effective mean batch size: {5}
"""
# Tokenizer for MODEL_NAME, loaded once at import time. It is not referenced
# anywhere in this chunk — presumably used by code outside this view (TODO confirm).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def get_message_single(
    status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
    """Render the single-request statistics report from its five metrics."""
    fields = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
    return RETURN_MESSAGE_SINGLE.format(*fields)
def get_message_spam(
    throughput,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    effective_batch_size,
    **kwargs,
):
    """Render the batch-benchmark statistics report from its aggregate metrics."""
    stats = (
        throughput,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
        effective_batch_size,
    )
    return RETURN_MESSAGE_SPAM.format(*stats)
# Shared session whose .post() returns a future, letting many requests be
# issued concurrently and resolved later via .result().
SESSION = FuturesSession()
def send_single(input_model_vanilla: str, address: str):
    """Send one text input to the inference server and report its statistics.

    Parameters
    ----------
    input_model_vanilla : str
        Raw text to run inference on; sent JSON-encoded with
        ``pre_tokenized=False``.
    address : str
        Target endpoint; must be ``ADDRESS_VANILLA`` or
        ``ADDRESS_BETTERTRANSFORMER``.

    Returns
    -------
    str
        The formatted statistics message, or the stringified exception text if
        the request failed (e.g. timed out).

    Raises
    ------
    ValueError
        If *address* is not one of the two known endpoints.
    """
    # `assert` is stripped under `python -O`, so validate explicitly instead.
    if address not in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER):
        raise ValueError(f"Unknown address: {address}")
    payload = json.dumps({"text": input_model_vanilla, "pre_tokenized": False}).encode(
        "utf-8"
    )
    start = time.time()
    # should not take more than 10 s, so timeout if that's the case
    promise = SESSION.post(address, headers=HEADERS, data=payload, timeout=10)
    try:
        response = promise.result()  # resolve ASAP
        end = time.time()
    except Exception as e:
        # Best-effort UI: surface the error text instead of raising.
        return f"{e}"
    status = response.status_code
    # The server replies with a JSON list:
    # [prediction, inference_latency_ms, peak_gpu_memory_mb, ...] — assumed
    # from the indexing below; TODO confirm against the server implementation.
    body = json.loads(response.text)
    prediction = body[0]
    inf_latency = body[1]
    peak_gpu_memory = body[2]
    end_to_end_latency = round((end - start) * 1e3, 2)
    return get_message_single(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )
def send_spam(inp: Dataset, address: str):
    """Fire all dataset sentences at the server concurrently and report stats.

    Parameters
    ----------
    inp : Dataset
        Dataset whose rows have a ``"sentence"`` text column.
    address : str
        Target endpoint; must be ``ADDRESS_VANILLA`` or
        ``ADDRESS_BETTERTRANSFORMER``.

    Returns
    -------
    str
        The formatted aggregate statistics message, or the stringified
        exception text if any request failed (e.g. timed out).

    Raises
    ------
    ValueError
        If *address* is not one of the two known endpoints.
    """
    # `assert` is stripped under `python -O`, so validate explicitly instead.
    if address not in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER):
        raise ValueError(f"Unknown address: {address}")
    n_inputs = len(inp)
    # Guard the empty dataset: the original code divided by n_inputs below.
    if n_inputs == 0:
        return "Empty dataset: nothing to send."

    promises = []
    start = time.time()
    for i in range(n_inputs):
        input_data = inp[i]["sentence"].encode("utf-8")
        # should not take more than 15 s, so timeout if that's the case
        promises.append(
            SESSION.post(address, headers=HEADERS, data=input_data, timeout=15)
        )

    # Resolve all futures once, timing the last completion for throughput;
    # responses are kept so they are not resolved a second time below.
    end = 0.0
    responses = []
    for promise in promises:
        try:
            responses.append(promise.result())  # resolve ASAP
        except Exception as e:
            # Best-effort UI: surface the error text instead of raising.
            return f"{e}"
        end = max(time.time(), end)

    # Aggregate the per-request metrics. The server replies with a JSON list:
    # [_, inference_latency_ms, peak_gpu_memory_mb, n_pads, n_elems,
    #  sequence_length, effective_batch_size] — assumed from the indexing
    # below; TODO confirm against the server implementation.
    total_inference_latency = 0.0
    total_peak_gpu_memory = 0.0
    n_pads = 0
    n_elems = 0
    total_sequence_length = 0
    total_effective_batch_size = 0.0
    for response in responses:
        metrics = json.loads(response.text)
        total_inference_latency += metrics[1]
        total_peak_gpu_memory += metrics[2]
        n_pads += metrics[3]
        n_elems += metrics[4]
        total_sequence_length += metrics[5]
        total_effective_batch_size += metrics[6]

    throughput = round(n_inputs / (end - start), 2)
    # Guard n_elems == 0 to avoid ZeroDivisionError on degenerate responses.
    mean_padding_ratio = f"{(n_pads / n_elems * 100) if n_elems else 0.0:.2f}"
    mean_sequence_length = total_sequence_length / n_inputs
    effective_batch_size = total_effective_batch_size / n_inputs
    mean_inference_latency = round(total_inference_latency / n_inputs, 2)
    mean_peak_gpu_memory = round(total_peak_gpu_memory / n_inputs, 2)
    return get_message_spam(
        throughput,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
        effective_batch_size,
    )