| |
| |
|
|
| import os |
| import json |
| from typing import Any |
| import numpy as np |
| import random |
| import torch |
| import torchvision |
| import torchvision.transforms as transforms |
| from PIL import Image |
| import cv2 |
| import matplotlib.pyplot as plt |
| from cog import BasePredictor, Input, Path, BaseModel |
|
|
| from subprocess import call |
|
|
| HOME = os.getcwd() |
| os.chdir("GroundingDINO") |
| call("pip install -q .", shell=True) |
| os.chdir(HOME) |
| os.chdir("segment_anything") |
| call("pip install -q .", shell=True) |
| os.chdir(HOME) |
|
|
| |
| import GroundingDINO.groundingdino.datasets.transforms as T |
| from GroundingDINO.groundingdino.models import build_model |
| from GroundingDINO.groundingdino.util.slconfig import SLConfig |
| from GroundingDINO.groundingdino.util.utils import ( |
| clean_state_dict, |
| get_phrases_from_posmap, |
| ) |
|
|
| |
| from segment_anything import build_sam, build_sam_hq, SamPredictor |
|
|
| from ram.models import ram |
|
|
|
|
| class ModelOutput(BaseModel): |
| tags: str |
| rounding_box_img: Path |
| masked_img: Path |
| json_data: Any |
|
|
|
|
| class Predictor(BasePredictor): |
| def setup(self): |
| """Load the model into memory to make running multiple predictions efficient""" |
| self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
| normalize = transforms.Normalize( |
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] |
| ) |
| self.image_size = 384 |
| self.transform = transforms.Compose( |
| [ |
| transforms.Resize((self.image_size, self.image_size)), |
| transforms.ToTensor(), |
| normalize, |
| ] |
| ) |
|
|
| |
| self.ram_model = ram( |
| pretrained="pretrained/ram_swin_large_14m.pth", |
| image_size=self.image_size, |
| vit="swin_l", |
| ) |
| self.ram_model.eval() |
| self.ram_model = self.ram_model.to(self.device) |
|
|
| self.model = load_model( |
| "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py", |
| "pretrained/groundingdino_swint_ogc.pth", |
| device=self.device, |
| ) |
|
|
| self.sam = SamPredictor( |
| build_sam(checkpoint="pretrained/sam_vit_h_4b8939.pth").to(self.device) |
| ) |
| self.sam_hq = SamPredictor( |
| build_sam_hq(checkpoint="pretrained/sam_hq_vit_h.pth").to(self.device) |
| ) |
|
|
| def predict( |
| self, |
| input_image: Path = Input(description="Input image"), |
| use_sam_hq: bool = Input( |
| description="Use sam_hq instead of SAM for prediction", default=False |
| ), |
| ) -> ModelOutput: |
| """Run a single prediction on the model""" |
|
|
| |
| box_threshold = 0.25 |
| text_threshold = 0.2 |
| iou_threshold = 0.5 |
|
|
| image_pil, image = load_image(str(input_image)) |
|
|
| raw_image = image_pil.resize((self.image_size, self.image_size)) |
| raw_image = self.transform(raw_image).unsqueeze(0).to(self.device) |
|
|
| with torch.no_grad(): |
| tags, tags_chinese = self.ram_model.generate_tag(raw_image) |
|
|
| tags = tags[0].replace(" |", ",") |
|
|
| |
| boxes_filt, scores, pred_phrases = get_grounding_output( |
| self.model, image, tags, box_threshold, text_threshold, device=self.device |
| ) |
|
|
| predictor = self.sam_hq if use_sam_hq else self.sam |
|
|
| image = cv2.imread(str(input_image)) |
| image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) |
| predictor.set_image(image) |
|
|
| size = image_pil.size |
| H, W = size[1], size[0] |
| for i in range(boxes_filt.size(0)): |
| boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H]) |
| boxes_filt[i][:2] -= boxes_filt[i][2:] / 2 |
| boxes_filt[i][2:] += boxes_filt[i][:2] |
|
|
| boxes_filt = boxes_filt.cpu() |
| |
| print(f"Before NMS: {boxes_filt.shape[0]} boxes") |
| nms_idx = ( |
| torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist() |
| ) |
| boxes_filt = boxes_filt[nms_idx] |
| pred_phrases = [pred_phrases[idx] for idx in nms_idx] |
| print(f"After NMS: {boxes_filt.shape[0]} boxes") |
|
|
| transformed_boxes = predictor.transform.apply_boxes_torch( |
| boxes_filt, image.shape[:2] |
| ).to(self.device) |
|
|
| masks, _, _ = predictor.predict_torch( |
| point_coords=None, |
| point_labels=None, |
| boxes=transformed_boxes.to(self.device), |
| multimask_output=False, |
| ) |
|
|
| |
| plt.figure(figsize=(10, 10)) |
| for mask in masks: |
| show_mask(mask.cpu().numpy(), plt.gca(), random_color=True) |
| for box, label in zip(boxes_filt, pred_phrases): |
| show_box(box.numpy(), plt.gca(), label) |
|
|
| rounding_box_path = "/tmp/automatic_label_output.png" |
| plt.axis("off") |
| plt.savefig( |
| Path(rounding_box_path), bbox_inches="tight", dpi=300, pad_inches=0.0 |
| ) |
| plt.close() |
|
|
| |
| value = 0 |
| mask_img = torch.zeros(masks.shape[-2:]) |
| for idx, mask in enumerate(masks): |
| mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1 |
| plt.figure(figsize=(10, 10)) |
| plt.imshow(mask_img.numpy()) |
| plt.axis("off") |
| masks_path = "/tmp/mask.png" |
| plt.savefig(masks_path, bbox_inches="tight", dpi=300, pad_inches=0.0) |
| plt.close() |
|
|
| json_data = { |
| "tags": tags, |
| "mask": [{"value": value, "label": "background"}], |
| } |
| for label, box in zip(pred_phrases, boxes_filt): |
| value += 1 |
| name, logit = label.split("(") |
| logit = logit[:-1] |
| json_data["mask"].append( |
| { |
| "value": value, |
| "label": name, |
| "logit": float(logit), |
| "box": box.numpy().tolist(), |
| } |
| ) |
|
|
| json_path = "/tmp/label.json" |
| with open(json_path, "w") as f: |
| json.dump(json_data, f) |
|
|
| return ModelOutput( |
| tags=tags, |
| masked_img=Path(masks_path), |
| rounding_box_img=Path(rounding_box_path), |
| json_data=Path(json_path), |
| ) |
|
|
|
|
| def get_grounding_output( |
| model, image, caption, box_threshold, text_threshold, device="cpu" |
| ): |
| caption = caption.lower() |
| caption = caption.strip() |
| if not caption.endswith("."): |
| caption = caption + "." |
| model = model.to(device) |
| image = image.to(device) |
| with torch.no_grad(): |
| outputs = model(image[None], captions=[caption]) |
| logits = outputs["pred_logits"].cpu().sigmoid()[0] |
| boxes = outputs["pred_boxes"].cpu()[0] |
| logits.shape[0] |
|
|
| |
| logits_filt = logits.clone() |
| boxes_filt = boxes.clone() |
| filt_mask = logits_filt.max(dim=1)[0] > box_threshold |
| logits_filt = logits_filt[filt_mask] |
| boxes_filt = boxes_filt[filt_mask] |
| logits_filt.shape[0] |
|
|
| |
| tokenlizer = model.tokenizer |
| tokenized = tokenlizer(caption) |
| |
| pred_phrases = [] |
| scores = [] |
| for logit, box in zip(logits_filt, boxes_filt): |
| pred_phrase = get_phrases_from_posmap( |
| logit > text_threshold, tokenized, tokenlizer |
| ) |
| pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") |
| scores.append(logit.max().item()) |
|
|
| return boxes_filt, torch.Tensor(scores), pred_phrases |
|
|
|
|
| def load_image(image_path): |
| |
| image_pil = Image.open(image_path).convert("RGB") |
|
|
| transform = T.Compose( |
| [ |
| T.RandomResize([800], max_size=1333), |
| T.ToTensor(), |
| T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), |
| ] |
| ) |
| image, _ = transform(image_pil, None) |
| return image_pil, image |
|
|
|
|
| def load_model(model_config_path, model_checkpoint_path, device): |
| args = SLConfig.fromfile(model_config_path) |
| args.device = device |
| model = build_model(args) |
| checkpoint = torch.load(model_checkpoint_path, map_location="cpu") |
| load_res = model.load_state_dict( |
| clean_state_dict(checkpoint["model"]), strict=False |
| ) |
| print(load_res) |
| _ = model.eval() |
| return model |
|
|
|
|
| def show_mask(mask, ax, random_color=False): |
| if random_color: |
| color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) |
| else: |
| color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6]) |
| h, w = mask.shape[-2:] |
| mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) |
| ax.imshow(mask_image) |
|
|
|
|
| def show_box(box, ax, label): |
| x0, y0 = box[0], box[1] |
| w, h = box[2] - box[0], box[3] - box[1] |
| ax.add_patch( |
| plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=1.5) |
| ) |
| ax.text(x0, y0, label) |
|
|