from typing import Any, Dict, List, Union

from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
from .base import PIPELINE_INIT_ARGS, Pipeline


if is_vision_available():
    from ..image_utils import load_image

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import (
        MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
    )

logger = logging.get_logger(__name__)

Prediction = Dict[str, Any]
Predictions = List[Prediction]


@add_end_docstrings(PIPELINE_INIT_ARGS)
class ObjectDetectionPipeline(Pipeline):
    """
    Object detection pipeline using any `AutoModelForObjectDetection`. This pipeline predicts bounding boxes of
    objects and their classes.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> detector = pipeline(model="facebook/detr-resnet-50")
    >>> detector("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'score': 0.997, 'label': 'bird', 'box': {'xmin': 69, 'ymin': 171, 'xmax': 396, 'ymax': 507}}, {'score': 0.999, 'label': 'bird', 'box': {'xmin': 398, 'ymin': 105, 'xmax': 767, 'ymax': 507}}]

    >>> # x, y are expressed relative to the top left hand corner.
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"object-detection"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=object-detection).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if self.framework == "tf":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")

        requires_backends(self, "vision")
        # Accept token-classification models too: LayoutLM-style models detect
        # objects by classifying OCR'd words, and are handled in `postprocess`.
        mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES.copy()
        mapping.update(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES)
        self.check_model_type(mapping)

    def _sanitize_parameters(self, **kwargs):
        preprocess_params = {}
        if "timeout" in kwargs:
            preprocess_params["timeout"] = kwargs["timeout"]
        postprocess_kwargs = {}
        if "threshold" in kwargs:
            postprocess_kwargs["threshold"] = kwargs["threshold"]
        # The three dicts are routed to `preprocess`, `_forward` and `postprocess`, respectively.
        return preprocess_params, {}, postprocess_kwargs

    def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
        """
        Detect objects (bounding boxes & classes) in the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing an HTTP(S) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
                same format: all as HTTP(S) links, all as local paths, or all as PIL images.
            threshold (`float`, *optional*, defaults to 0.9):
                The minimum probability a detection must have to be returned.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list of dictionaries or a list of lists of dictionaries containing the result. If the input is a single
            image, returns a list of dictionaries; if the input is a list of several images, returns a list of lists
            of dictionaries, one list per image.

            The dictionaries contain the following keys:

            - **label** (`str`) -- The class label identified by the model.
            - **score** (`float`) -- The score attributed by the model to that label.
            - **box** (`Dict[str, int]`) -- The bounding box of the detected object in the image's original size.
        """
        return super().__call__(*args, **kwargs)
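
    # Illustrative usage sketch (not part of the original module); the checkpoint is
    # the one from the class docstring, the image paths are placeholders:
    #
    #   detector = pipeline("object-detection", model="facebook/detr-resnet-50")
    #   detector("cats.png", threshold=0.5)               # -> List[Prediction]
    #   detector(["cats.png", "dogs.png"], timeout=10.0)  # -> List[List[Prediction]]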

    def preprocess(self, image, timeout=None):
        image = load_image(image, timeout=timeout)
        # Remember the original (height, width) so boxes can be expressed in the
        # image's original coordinates during postprocessing.
        target_size = torch.IntTensor([[image.height, image.width]])
        inputs = self.image_processor(images=[image], return_tensors="pt")
        if self.tokenizer is not None:
            # LayoutLM-style variant: the image processor ran OCR, so tokenize the
            # recovered words together with their boxes.
            inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt")
        inputs["target_size"] = target_size
        return inputs

    def _forward(self, model_inputs):
        target_size = model_inputs.pop("target_size")
        outputs = self.model(**model_inputs)
        model_outputs = outputs.__class__({"target_size": target_size, **outputs})
        if self.tokenizer is not None:
            # Carry the OCR boxes along so `postprocess` can map token predictions back to them.
            model_outputs["bbox"] = model_inputs["bbox"]
        return model_outputs

    def postprocess(self, model_outputs, threshold=0.9):
        target_size = model_outputs["target_size"]
        if self.tokenizer is not None:
            # This is a LayoutLMForTokenClassification variant.
            # The OCR got the boxes, and the model classified the words.
            height, width = target_size[0].tolist()

            def unnormalize(bbox):
                # OCR boxes are normalized to a 0-1000 scale; map them back to pixel coordinates.
                return self._get_bounding_box(
                    torch.Tensor(
                        [
                            (width * bbox[0] / 1000),
                            (height * bbox[1] / 1000),
                            (width * bbox[2] / 1000),
                            (height * bbox[3] / 1000),
                        ]
                    )
                )

            scores, classes = model_outputs["logits"].squeeze(0).softmax(dim=-1).max(dim=-1)
            labels = [self.model.config.id2label[prediction] for prediction in classes.tolist()]
            boxes = [unnormalize(bbox) for bbox in model_outputs["bbox"].squeeze(0)]
            keys = ["score", "label", "box"]
            annotation = [dict(zip(keys, vals)) for vals in zip(scores.tolist(), labels, boxes) if vals[0] > threshold]
        else:
            # This is a regular AutoModelForObjectDetection model.
            raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size)
            raw_annotation = raw_annotations[0]
            scores = raw_annotation["scores"]
            labels = raw_annotation["labels"]
            boxes = raw_annotation["boxes"]
            raw_annotation["scores"] = scores.tolist()
            raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in labels]
            raw_annotation["boxes"] = [self._get_bounding_box(box) for box in boxes]

            # {"scores": [...], ...} --> [{"score": x, ...}, ...]
            keys = ["score", "label", "box"]
            annotation = [
                dict(zip(keys, vals))
                for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"])
            ]

        return annotation
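
    # For reference, the final structure for one image looks like (values taken from
    # the class docstring example above):
    #   [{"score": 0.997, "label": "bird",
    #     "box": {"xmin": 69, "ymin": 171, "xmax": 396, "ymax": 507}}]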

    def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
        """
        Turns a tensor [xmin, ymin, xmax, ymax] into a dict { "xmin": xmin, ... }

        Args:
            box (`torch.Tensor`): Tensor containing the coordinates in corners format.

        Returns:
            bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
        """
        if self.framework != "pt":
            raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.")
        xmin, ymin, xmax, ymax = box.int().tolist()
        bbox = {
            "xmin": xmin,
            "ymin": ymin,
            "xmax": xmax,
            "ymax": ymax,
        }
        return bbox
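

# Illustrative sketch, not part of the original module: `_get_bounding_box` truncates
# float coordinates to ints via `Tensor.int()`, e.g. (with `detector` a hypothetical
# ObjectDetectionPipeline instance):
#
#   detector._get_bounding_box(torch.tensor([69.4, 171.2, 396.0, 507.8]))
#   # -> {"xmin": 69, "ymin": 171, "xmax": 396, "ymax": 507}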