#! /usr/bin/python3
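"""yolo_nas_pose_to_onnx.py

Export YOLO-NAS-Pose (N/S/M/L) models from PyTorch to ONNX for ONNX Runtime on
JetPack 5. For each model, FP32, FP16 and INT8 variants are produced (INT8 is
calibrated on the cppe-5 dataset), the usage notes returned by model.export()
are written to a .usage.txt file, and the exported graph is benchmarked with
ONNX Runtime, with optional output-shape and visual checks.
"""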
import os

# Set before importing super_gradients so the crash-handler setting takes effect.
os.environ['CRASH_HANDLER'] = '0'

import time

import cv2
import numpy as np
import matplotlib.pyplot as plt
import onnxruntime
from termcolor import colored
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

from super_gradients.common.object_names import Models
from super_gradients.training import models
from super_gradients.conversion import ExportTargetBackend, ExportQuantizationMode, DetectionOutputFormatMode
from super_gradients.training.utils.media.image import load_image
from super_gradients.training.utils.visualization.pose_estimation import PoseVisualization

# Conversion settings
CONVERSION = True
input_image_shape = [640, 640]
quantization_modes = [ExportQuantizationMode.INT8, ExportQuantizationMode.FP16, None]
output_predictions_format = DetectionOutputFormatMode.FLAT_FORMAT

# NMS-related settings
confidence_threshold = 0.15
nms_threshold = 0.2
num_pre_nms_predictions = 1000
max_predictions_per_image = 10

# ONNX Runtime benchmark settings
BENCHMARK = True
n_run = 1000
n_warm_up = 200
image_name = "https://deci-pretrained-models.s3.amazonaws.com/sample_images/beatles-abbeyroad.jpg"

# Checks
SHAPE_CHECK = True
VISUAL_CHECK = True
CALIBRATION_DATASET_CHECK = False
# Function to convert tensor to image for visualization
def tensor_to_image(tensor):
# Convert the tensor to a numpy array
numpy_image = tensor.numpy()
# The output of ToTensor() is in C x H x W format, convert to H x W x C
numpy_image = numpy_image.transpose(1, 2, 0)
# Undo the normalization (if any)
# numpy_image = numpy_image * std + mean # Adjust based on your normalization
return numpy_image
class HFDatasetWrapper(Dataset):
def __init__(self, hf_dataset, transform=None):
self.hf_dataset = hf_dataset
self.transform = transform
def __len__(self):
return len(self.hf_dataset)
def __getitem__(self, idx):
item = self.hf_dataset[idx]
if self.transform:
item = self.transform(item)
return item['image']
def preprocess(data):
# Convert byte data to PIL Image
image = data['image']
# Convert to RGB if not already
if image.mode != 'RGB':
image = image.convert('RGB')
    # Transformations applied to each calibration image
    transform = transforms.Compose([
        transforms.Resize((640, 640)),  # Match the 640x640 export input size
        transforms.ToTensor(),          # Convert to a float tensor in [0, 1]
        # Add normalization or other transformations if needed
    ])
# Process Image
transformed = transform(image)
if CALIBRATION_DATASET_CHECK:
# Display the Processed Image
plt_image = tensor_to_image(transformed)
plt.imshow(plt_image)
plt.axis('off') # Turn off axis numbers
plt.show()
return {'image': transformed}
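# FLAT_FORMAT output layout: the exported model returns a single 2D tensor in which
# each row is one detection:
#   [image_index, x1, y1, x2, y2, score, (x, y, confidence) * num_joints]
# The helpers below slice the rows back into per-image boxes, scores and joints.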
def iterate_over_flat_predictions(predictions, batch_size):
[flat_predictions] = predictions
for image_index in range(batch_size):
mask = flat_predictions[:, 0] == image_index
pred_bboxes = flat_predictions[mask, 1:5]
pred_scores = flat_predictions[mask, 5]
pred_joints = flat_predictions[mask, 6:].reshape((len(pred_bboxes), -1, 3))
yield image_index, pred_bboxes, pred_scores, pred_joints
def show_predictions_from_flat_format(image, predictions):
image_index, pred_boxes, pred_scores, pred_joints = next(iter(iterate_over_flat_predictions(predictions, 1)))
image = PoseVisualization.draw_poses(
image=image, poses=pred_joints, scores=pred_scores, boxes=pred_boxes,
edge_links=None, edge_colors=None, keypoint_colors=None, is_crowd=None
)
plt.figure(figsize=(8, 8))
plt.imshow(image)
plt.tight_layout()
plt.show()
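# Load the benchmark image and convert it from HWC to BCHW uint8. Preprocessing is
# embedded in the exported graph (preprocessing=True), so raw pixels are passed as-is.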
image = load_image(image_name)
image = cv2.resize(image, (input_image_shape[1], input_image_shape[0]))
image_bchw = np.transpose(np.expand_dims(image, 0), (0, 3, 1, 2))
# Prepare Calibration Dataset for INT8 Quantization
dataset = load_dataset("cppe-5", split="train")
hf_dataset_wrapper = HFDatasetWrapper(dataset, transform=preprocess)
calibration_loader = DataLoader(hf_dataset_wrapper, batch_size=8)
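# Note: the calibration loader is only needed for the INT8 export; it is passed to
# model.export() for every precision below and is expected to be ignored otherwise.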
for model_name in [Models.YOLO_NAS_POSE_L, Models.YOLO_NAS_POSE_M, Models.YOLO_NAS_POSE_N, Models.YOLO_NAS_POSE_S ]:
for q in quantization_modes:
        # Encode the quantization mode in the exported ONNX file name
        if q is None:
            q_label = 'fp32'
        elif q == ExportQuantizationMode.INT8:
            q_label = 'int8'
        elif q == ExportQuantizationMode.FP16:
            q_label = 'fp16'
        else:
            raise ValueError(f"Unsupported quantization mode: {q}")
export_name = f"{model_name}_{q_label}.onnx"
        # Convert the model from PyTorch to ONNX with the official SuperGradients export method
print(f"1. Convert {colored(model_name,'blue')} from PyTorch to ONNX format using {colored(q_label,'red')} precision, saved as {colored(export_name,'green')}")
if CONVERSION:
model = models.get(model_name, pretrained_weights="coco_pose")
export_result = model.export(
output=export_name,
confidence_threshold=confidence_threshold,
nms_threshold=nms_threshold,
engine=ExportTargetBackend.ONNXRUNTIME,
quantization_mode=q,
#selective_quantizer: Optional["SelectiveQuantizer"] = None, # noqa
                calibration_loader=calibration_loader,
#calibration_method: str = "percentile",
#calibration_batches: int = 16,
#calibration_percentile: float = 99.99,
preprocessing=True,
postprocessing=True,
#postprocessing_kwargs: Optional[dict] = None,
batch_size=1,
input_image_shape=input_image_shape,
#input_image_channels: Optional[int] = None,
#input_image_dtype: Optional[torch.dtype] = None,
max_predictions_per_image=max_predictions_per_image,
onnx_export_kwargs={"opset_version":14},
onnx_simplify=True,
#device: Optional[Union[torch.device, str]] = None,
output_predictions_format=output_predictions_format,
num_pre_nms_predictions=num_pre_nms_predictions,
)
            # Also export the model usage instructions as a text file
usage_name = export_name + '.usage.txt'
with open(usage_name, 'w') as f:
f.write(str(export_result))
print(f"1.1 Related usage to {colored(export_name, 'green')} has been stored to {colored(usage_name,'yellow')}")
        if BENCHMARK:
            # Run inference with ONNX Runtime
            session = onnxruntime.InferenceSession(export_name, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
            inputs = [o.name for o in session.get_inputs()]
            outputs = [o.name for o in session.get_outputs()]
            # Warm up, then average latency over n_run timed passes
            for _ in range(n_warm_up):
                result = session.run(outputs, {inputs[0]: image_bchw})
            t = time.time()
            for _ in range(n_run):
                result = session.run(outputs, {inputs[0]: image_bchw})
            latency = (time.time() - t) / n_run
            fps = round(1 / latency, 2)
            print(f'2. Averaged FPS: {colored(fps, "red")}')
            if SHAPE_CHECK:
                # Detection result shapes: box (4,), score (scalar), joints (num_joints x 3)
                for image_index, pred_bboxes, pred_scores, pred_joints in iterate_over_flat_predictions(result, batch_size=1):
                    N = pred_scores.shape[0]
                    for i in range(N):
                        print(f'Detected Object {colored(i, "green")}')
                        print('Predicted Bounding Box (Dimension: 4)', pred_bboxes[i, :])
                        print('Pose Confidence (scalar)', pred_scores[i])
                        print('Predicted Joints (Dimension: 17 x 3)', pred_joints[i, :, :])
            if VISUAL_CHECK:
                # Visual check of the detection result
                show_predictions_from_flat_format(image, result)
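# A full run produces, for each of the four models (yolo_nas_pose_{n,s,m,l}):
#   <model>_fp32.onnx, <model>_fp16.onnx and <model>_int8.onnx,
# each accompanied by a .usage.txt file with the usage notes returned by model.export().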