#! /usr/bin/python3
from termcolor import cprint, colored
from super_gradients.common.object_names import Models
from super_gradients.training import models
from super_gradients.conversion import ExportTargetBackend, ExportQuantizationMode, DetectionOutputFormatMode
import time
import cv2
import numpy as np
from super_gradients.training.utils.media.image import load_image
import onnxruntime
import os
from super_gradients.training.utils.visualization.pose_estimation import PoseVisualization
import matplotlib.pyplot as plt
from datasets import load_dataset
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset

os.environ['CRASH_HANDLER'] = '0'

# Conversion Settings
CONVERSION = True
input_image_shape = [640, 640]
quantization_modes = [ExportQuantizationMode.INT8, ExportQuantizationMode.FP16, None]
output_predictions_format = DetectionOutputFormatMode.FLAT_FORMAT

# NMS-related Settings
confidence_threshold = .15
nms_threshold = .2
num_pre_nms_predictions = 1000
max_predictions_per_image = 10

# ONNXruntime Benchmark Settings
BENCHMARK = True
n_run = 1000
n_warm_up = 200
image_name = "https://deci-pretrained-models.s3.amazonaws.com/sample_images/beatles-abbeyroad.jpg"

# Checks
SHAPE_CHECK = True
VISUAL_CHECK = True
CALIBRATION_DATASET_CHECK = False


# Function to convert a tensor to an image for visualization
def tensor_to_image(tensor):
    # Convert the tensor to a numpy array
    numpy_image = tensor.numpy()
    # The output of ToTensor() is in C x H x W format, convert to H x W x C
    numpy_image = numpy_image.transpose(1, 2, 0)
    # Undo the normalization (if any)
    # numpy_image = numpy_image * std + mean  # Adjust based on your normalization
    return numpy_image


class HFDatasetWrapper(Dataset):
    def __init__(self, hf_dataset, transform=None):
        self.hf_dataset = hf_dataset
        self.transform = transform

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        item = self.hf_dataset[idx]
        if self.transform:
            item = self.transform(item)
        return item['image']


def preprocess(data):
    # Convert byte data to a PIL Image
    image = data['image']
    # Convert to RGB if not already
    if image.mode != 'RGB':
        image = image.convert('RGB')
    # Define the transformations
    transform = transforms.Compose([
        transforms.Resize((640, 640)),  # Resize (example size)
        transforms.ToTensor(),          # Convert to tensor
        # Add normalization or other transformations if needed
    ])
    # Process the image
    transformed = transform(image)
    if CALIBRATION_DATASET_CHECK:
        # Display the processed image
        plt_image = tensor_to_image(transformed)
        plt.imshow(plt_image)
        plt.axis('off')  # Turn off axis numbers
        plt.show()
    return {'image': transformed}


def iterate_over_flat_predictions(predictions, batch_size):
    [flat_predictions] = predictions
    for image_index in range(batch_size):
        mask = flat_predictions[:, 0] == image_index
        pred_bboxes = flat_predictions[mask, 1:5]
        pred_scores = flat_predictions[mask, 5]
        pred_joints = flat_predictions[mask, 6:].reshape((len(pred_bboxes), -1, 3))
        yield image_index, pred_bboxes, pred_scores, pred_joints


def show_predictions_from_flat_format(image, predictions):
    image_index, pred_boxes, pred_scores, pred_joints = next(iter(iterate_over_flat_predictions(predictions, 1)))

    image = PoseVisualization.draw_poses(
        image=image,
        poses=pred_joints,
        scores=pred_scores,
        boxes=pred_boxes,
        edge_links=None,
        edge_colors=None,
        keypoint_colors=None,
        is_crowd=None,
    )

    plt.figure(figsize=(8, 8))
    plt.imshow(image)
    plt.tight_layout()
    plt.show()


image = load_image(image_name)
image = cv2.resize(image, (input_image_shape[1], input_image_shape[0]))
image_bchw = np.transpose(np.expand_dims(image, 0), (0, 3, 1, 2))

# Prepare the Calibration Dataset for INT8 Quantization
dataset = load_dataset("cppe-5", split="train")
hf_dataset_wrapper = HFDatasetWrapper(dataset, transform=preprocess)
calibration_loader = DataLoader(hf_dataset_wrapper, batch_size=8)

for model_name in [Models.YOLO_NAS_POSE_L, Models.YOLO_NAS_POSE_M, Models.YOLO_NAS_POSE_N, Models.YOLO_NAS_POSE_S]:
    for q in quantization_modes:
        # Encode the quantization mode in the exported ONNX model name
        if q is None:
            q_label = 'fp32'
        elif q == ExportQuantizationMode.INT8:
            q_label = 'int8'
        elif q == ExportQuantizationMode.FP16:
            q_label = 'fp16'
        else:
            raise ValueError(f"Unsupported quantization mode: {q}")

        export_name = f"{model_name}_{q_label}.onnx"

        # Convert the model from PyTorch to ONNX using the official Super-Gradients export method
        print(f"1. Convert {colored(model_name, 'blue')} from PyTorch to ONNX format using {colored(q_label, 'red')} precision, saved as {colored(export_name, 'green')}")
        if CONVERSION:
            model = models.get(model_name, pretrained_weights="coco_pose")
            export_result = model.export(
                output=export_name,
                confidence_threshold=confidence_threshold,
                nms_threshold=nms_threshold,
                engine=ExportTargetBackend.ONNXRUNTIME,
                quantization_mode=q,
                # selective_quantizer: Optional["SelectiveQuantizer"] = None,  # noqa
                calibration_loader=calibration_loader if q == ExportQuantizationMode.INT8 else None,
                # calibration_method: str = "percentile",
                # calibration_batches: int = 16,
                # calibration_percentile: float = 99.99,
                preprocessing=True,
                postprocessing=True,
                # postprocessing_kwargs: Optional[dict] = None,
                batch_size=1,
                input_image_shape=input_image_shape,
                # input_image_channels: Optional[int] = None,
                # input_image_dtype: Optional[torch.dtype] = None,
                max_predictions_per_image=max_predictions_per_image,
                onnx_export_kwargs={"opset_version": 14},
                onnx_simplify=True,
                # device: Optional[Union[torch.device, str]] = None,
                output_predictions_format=output_predictions_format,
                num_pre_nms_predictions=num_pre_nms_predictions,
            )

            # Also export the model usage instructions as text
            usage_name = export_name + '.usage.txt'
            with open(usage_name, 'w') as f:
                f.write(str(export_result))
            print(f"1.1 Related usage to {colored(export_name, 'green')} has been stored to {colored(usage_name, 'yellow')}")

        if BENCHMARK:
            # Perform inference with ONNX Runtime
            session = onnxruntime.InferenceSession(export_name, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
            inputs = [o.name for o in session.get_inputs()]
            outputs = [o.name for o in session.get_outputs()]

            # Warm-up runs before timing
            for i in range(n_warm_up):
                result = session.run(outputs, {inputs[0]: image_bchw})

            # Timed runs
            t = time.time()
            for i in range(n_run):
                result = session.run(outputs, {inputs[0]: image_bchw})
            latency = (time.time() - t) / n_run
            fps = round(1 / latency, 2)
            print(f'2. Averaged FPS: {colored(fps, "red")}')

            if SHAPE_CHECK:
                # Detection result shape check
                for image_index, pred_bboxes, pred_scores, pred_joints in iterate_over_flat_predictions(result, batch_size=1):
                    N = pred_scores.shape[0]
                    for i in range(N):
                        print(f'Detected Object {colored(i, "green")}')
                        print('Predicted Bounding Box (Dimension: 1 x 4)', pred_bboxes[i, :])
                        print('Pose Confidence (scalar)', pred_scores[i])
                        print('Predicted Joints (Dimension: 17 x 3)', pred_joints[i, :, :])

            if VISUAL_CHECK:
                # Detection result visual check
                show_predictions_from_flat_format(image, result)
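
# ------------------------------------------------------------------------------
# Optional extension (an assumption, not part of the original script): a minimal
# sketch of how the per-model / per-precision FPS numbers could be summarised
# after the run. `benchmark_records` is a hypothetical list of
# (model_name, q_label, fps) tuples that would have to be collected inside the
# BENCHMARK block above, e.g. benchmark_records.append((model_name, q_label, fps)).
def print_benchmark_summary(records):
    # records: iterable of (model_name, precision_label, fps) tuples
    print(f"{'Model':<40} {'Precision':<10} {'FPS':>10}")
    for record_model, record_precision, record_fps in records:
        print(f"{str(record_model):<40} {record_precision:<10} {record_fps:>10.2f}")

# Example usage (only once the records have actually been collected):
# print_benchmark_summary(benchmark_records)
# ------------------------------------------------------------------------------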