vierundvi / predict.py

2cd560a 10 months ago

9.13 kB

	# Prediction interface for Cog ⚙️
	# https://github.com/replicate/cog/blob/main/docs/python.md

	import os
	import json
	from typing import Any
	import numpy as np
	import random
	import torch
	import torchvision
	import torchvision.transforms as transforms
	from PIL import Image
	import cv2
	import matplotlib.pyplot as plt
	from cog import BasePredictor, Input, Path, BaseModel

	from subprocess import call

	HOME = os.getcwd()
	os.chdir("GroundingDINO")
	call("pip install -q .", shell=True)
	os.chdir(HOME)
	os.chdir("segment_anything")
	call("pip install -q .", shell=True)
	os.chdir(HOME)

	# Grounding DINO
	import GroundingDINO.groundingdino.datasets.transforms as T
	from GroundingDINO.groundingdino.models import build_model
	from GroundingDINO.groundingdino.util.slconfig import SLConfig
	from GroundingDINO.groundingdino.util.utils import (
	clean_state_dict,
	get_phrases_from_posmap,
	)

	# segment anything
	from segment_anything import build_sam, build_sam_hq, SamPredictor

	from ram.models import ram


	class ModelOutput(BaseModel):
	tags: str
	rounding_box_img: Path
	masked_img: Path
	json_data: Any


	class Predictor(BasePredictor):
	def setup(self):
	"""Load the model into memory to make running multiple predictions efficient"""
	self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	normalize = transforms.Normalize(
	mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
	)
	self.image_size = 384
	self.transform = transforms.Compose(
	[
	transforms.Resize((self.image_size, self.image_size)),
	transforms.ToTensor(),
	normalize,
	]
	)

	# load model
	self.ram_model = ram(
	pretrained="pretrained/ram_swin_large_14m.pth",
	image_size=self.image_size,
	vit="swin_l",
	)
	self.ram_model.eval()
	self.ram_model = self.ram_model.to(self.device)

	self.model = load_model(
	"GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
	"pretrained/groundingdino_swint_ogc.pth",
	device=self.device,
	)

	self.sam = SamPredictor(
	build_sam(checkpoint="pretrained/sam_vit_h_4b8939.pth").to(self.device)
	)
	self.sam_hq = SamPredictor(
	build_sam_hq(checkpoint="pretrained/sam_hq_vit_h.pth").to(self.device)
	)

	def predict(
	self,
	input_image: Path = Input(description="Input image"),
	use_sam_hq: bool = Input(
	description="Use sam_hq instead of SAM for prediction", default=False
	),
	) -> ModelOutput:
	"""Run a single prediction on the model"""

	# default settings
	box_threshold = 0.25
	text_threshold = 0.2
	iou_threshold = 0.5

	image_pil, image = load_image(str(input_image))

	raw_image = image_pil.resize((self.image_size, self.image_size))
	raw_image = self.transform(raw_image).unsqueeze(0).to(self.device)

	with torch.no_grad():
	tags, tags_chinese = self.ram_model.generate_tag(raw_image)

	tags = tags[0].replace(" \|", ",")

	# run grounding dino model
	boxes_filt, scores, pred_phrases = get_grounding_output(
	self.model, image, tags, box_threshold, text_threshold, device=self.device
	)

	predictor = self.sam_hq if use_sam_hq else self.sam

	image = cv2.imread(str(input_image))
	image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	predictor.set_image(image)

	size = image_pil.size
	H, W = size[1], size[0]
	for i in range(boxes_filt.size(0)):
	boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
	boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
	boxes_filt[i][2:] += boxes_filt[i][:2]

	boxes_filt = boxes_filt.cpu()
	# use NMS to handle overlapped boxes
	print(f"Before NMS: {boxes_filt.shape[0]} boxes")
	nms_idx = (
	torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
	)
	boxes_filt = boxes_filt[nms_idx]
	pred_phrases = [pred_phrases[idx] for idx in nms_idx]
	print(f"After NMS: {boxes_filt.shape[0]} boxes")

	transformed_boxes = predictor.transform.apply_boxes_torch(
	boxes_filt, image.shape[:2]
	).to(self.device)

	masks, _, _ = predictor.predict_torch(
	point_coords=None,
	point_labels=None,
	boxes=transformed_boxes.to(self.device),
	multimask_output=False,
	)

	# draw output image
	plt.figure(figsize=(10, 10))
	for mask in masks:
	show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
	for box, label in zip(boxes_filt, pred_phrases):
	show_box(box.numpy(), plt.gca(), label)

	rounding_box_path = "/tmp/automatic_label_output.png"
	plt.axis("off")
	plt.savefig(
	Path(rounding_box_path), bbox_inches="tight", dpi=300, pad_inches=0.0
	)
	plt.close()

	# save masks and json data
	value = 0 # 0 for background
	mask_img = torch.zeros(masks.shape[-2:])
	for idx, mask in enumerate(masks):
	mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1
	plt.figure(figsize=(10, 10))
	plt.imshow(mask_img.numpy())
	plt.axis("off")
	masks_path = "/tmp/mask.png"
	plt.savefig(masks_path, bbox_inches="tight", dpi=300, pad_inches=0.0)
	plt.close()

	json_data = {
	"tags": tags,
	"mask": [{"value": value, "label": "background"}],
	}
	for label, box in zip(pred_phrases, boxes_filt):
	value += 1
	name, logit = label.split("(")
	logit = logit[:-1] # the last is ')'
	json_data["mask"].append(
	{
	"value": value,
	"label": name,
	"logit": float(logit),
	"box": box.numpy().tolist(),
	}
	)

	json_path = "/tmp/label.json"
	with open(json_path, "w") as f:
	json.dump(json_data, f)

	return ModelOutput(
	tags=tags,
	masked_img=Path(masks_path),
	rounding_box_img=Path(rounding_box_path),
	json_data=Path(json_path),
	)


	def get_grounding_output(
	model, image, caption, box_threshold, text_threshold, device="cpu"
	):
	caption = caption.lower()
	caption = caption.strip()
	if not caption.endswith("."):
	caption = caption + "."
	model = model.to(device)
	image = image.to(device)
	with torch.no_grad():
	outputs = model(image[None], captions=[caption])
	logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
	boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
	logits.shape[0]

	# filter output
	logits_filt = logits.clone()
	boxes_filt = boxes.clone()
	filt_mask = logits_filt.max(dim=1)[0] > box_threshold
	logits_filt = logits_filt[filt_mask] # num_filt, 256
	boxes_filt = boxes_filt[filt_mask] # num_filt, 4
	logits_filt.shape[0]

	# get phrase
	tokenlizer = model.tokenizer
	tokenized = tokenlizer(caption)
	# build pred
	pred_phrases = []
	scores = []
	for logit, box in zip(logits_filt, boxes_filt):
	pred_phrase = get_phrases_from_posmap(
	logit > text_threshold, tokenized, tokenlizer
	)
	pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
	scores.append(logit.max().item())

	return boxes_filt, torch.Tensor(scores), pred_phrases


	def load_image(image_path):
	# load image
	image_pil = Image.open(image_path).convert("RGB") # load image

	transform = T.Compose(
	[
	T.RandomResize([800], max_size=1333),
	T.ToTensor(),
	T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
	]
	)
	image, _ = transform(image_pil, None) # 3, h, w
	return image_pil, image


	def load_model(model_config_path, model_checkpoint_path, device):
	args = SLConfig.fromfile(model_config_path)
	args.device = device
	model = build_model(args)
	checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
	load_res = model.load_state_dict(
	clean_state_dict(checkpoint["model"]), strict=False
	)
	print(load_res)
	_ = model.eval()
	return model


	def show_mask(mask, ax, random_color=False):
	if random_color:
	color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
	else:
	color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
	h, w = mask.shape[-2:]
	mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
	ax.imshow(mask_image)


	def show_box(box, ax, label):
	x0, y0 = box[0], box[1]
	w, h = box[2] - box[0], box[3] - box[1]
	ax.add_patch(
	plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=1.5)
	)
	ax.text(x0, y0, label)