diff --git a/datasets/coco.py b/datasets/coco.py new file mode 100644 index 00000000..190c26cb --- /dev/null +++ b/datasets/coco.py @@ -0,0 +1,200 @@ +import json +import pathlib +import zipfile +import numpy as np +from extra.utils import download_file +import pycocotools._mask as _mask +from examples.mask_rcnn import Masker +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval + +iou = _mask.iou +merge = _mask.merge +frPyObjects = _mask.frPyObjects + +BASEDIR = pathlib.Path(__file__).parent.parent / "datasets/COCO" + +def create_dict(key_row, val_row, rows): return {row[key_row]:row[val_row] for row in rows} + + +if not pathlib.Path(BASEDIR/'val2017').is_dir(): + fn = BASEDIR/'val2017.zip' + download_file('http://images.cocodataset.org/zips/val2017.zip',fn) + with zipfile.ZipFile(fn, 'r') as zip_ref: + zip_ref.extractall(BASEDIR) + fn.unlink() + + +if not pathlib.Path(BASEDIR/'annotations').is_dir(): + fn = BASEDIR/'annotations_trainval2017.zip' + download_file('http://images.cocodataset.org/annotations/annotations_trainval2017.zip',fn) + with zipfile.ZipFile(fn, 'r') as zip_ref: + zip_ref.extractall(BASEDIR) + fn.unlink() + +with open(BASEDIR/'annotations/instances_val2017.json', 'r') as f: + annotations_raw = json.loads(f.read()) +images = annotations_raw['images'] +categories = annotations_raw['categories'] +annotations = annotations_raw['annotations'] +file_name_to_id = create_dict('file_name', 'id', images) +id_to_width = create_dict('id', 'width', images) +id_to_height = create_dict('id', 'height', images) +json_category_id_to_contiguous_id = {v['id']: i + 1 for i, v in enumerate(categories)} +contiguous_category_id_to_json_id = {v:k for k,v in json_category_id_to_contiguous_id.items()} + + +def encode(bimask): + if len(bimask.shape) == 3: + return _mask.encode(bimask) + elif len(bimask.shape) == 2: + h, w = bimask.shape + return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] + +def decode(rleObjs): + if type(rleObjs) == list: + return _mask.decode(rleObjs) + else: + return _mask.decode([rleObjs])[:,:,0] + +def area(rleObjs): + if type(rleObjs) == list: + return _mask.area(rleObjs) + else: + return _mask.area([rleObjs])[0] + +def toBbox(rleObjs): + if type(rleObjs) == list: + return _mask.toBbox(rleObjs) + else: + return _mask.toBbox([rleObjs])[0] + + +def convert_prediction_to_coco_bbox(file_name, prediction): + coco_results = [] + try: + original_id = file_name_to_id[file_name] + if len(prediction) == 0: + return coco_results + + image_width = id_to_width[original_id] + image_height = id_to_height[original_id] + prediction = prediction.resize((image_width, image_height)) + prediction = prediction.convert("xywh") + + boxes = prediction.bbox.numpy().tolist() + scores = prediction.get_field("scores").numpy().tolist() + labels = prediction.get_field("labels").numpy().tolist() + + mapped_labels = [contiguous_category_id_to_json_id[int(i)] for i in labels] + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": mapped_labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + except Exception as e: + print(file_name, e) + return coco_results + +masker = Masker(threshold=0.5, padding=1) + +def convert_prediction_to_coco_mask(file_name, prediction): + coco_results = [] + try: + original_id = file_name_to_id[file_name] + if len(prediction) == 0: + return coco_results + + image_width = id_to_width[original_id] + image_height = id_to_height[original_id] + prediction = prediction.resize((image_width, 
image_height)) + masks = prediction.get_field("mask") + + scores = prediction.get_field("scores").numpy().tolist() + labels = prediction.get_field("labels").numpy().tolist() + + masks = masker([masks], [prediction])[0].numpy() + + rles = [ + encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + mapped_labels = [contiguous_category_id_to_json_id[int(i)] for i in labels] + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": mapped_labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + except Exception as e: + print(file_name, e) + return coco_results + + + +def accumulate_predictions_for_coco(coco_results, json_result_file, rm=False): + path = pathlib.Path(json_result_file) + if rm and path.exists(): path.unlink() + with open(path, "a") as f: + for s in coco_results: + f.write(json.dumps(s)) + f.write('\n') + +def remove_dup(l): + seen = set() + seen_add = seen.add + return [x for x in l if not (x in seen or seen_add(x))] + +class NpEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + return super(NpEncoder, self).default(obj) + + +def evaluate_predictions_on_coco(json_result_file, iou_type="bbox"): + coco_results = [] + with open(json_result_file, "r") as f: + for line in f: + coco_results.append(json.loads(line)) + + coco_gt = COCO(str(BASEDIR/'annotations/instances_val2017.json')) + set_of_json = remove_dup([json.dumps(d, cls=NpEncoder) for d in coco_results]) + unique_list = [json.loads(s) for s in set_of_json] + + with open(f'{json_result_file}.flattend', "w") as f: + json.dump(unique_list, f) + + coco_dt = coco_gt.loadRes(str(f'{json_result_file}.flattend')) + coco_eval = COCOeval(coco_gt, coco_dt, iou_type) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + return coco_eval + +def iterate(files, bs=1): + batch = [] + for file in files: + batch.append(file) + if len(batch) >= bs: yield batch; batch = [] + if len(batch) > 0: yield batch; batch = [] diff --git a/examples/mask_rcnn.py b/examples/mask_rcnn.py new file mode 100644 index 00000000..66a57275 --- /dev/null +++ b/examples/mask_rcnn.py @@ -0,0 +1,299 @@ +from models.mask_rcnn import MaskRCNN +from models.resnet import ResNet +from models.mask_rcnn import BoxList +from torch.nn import functional as F +from torchvision import transforms as T +from torchvision.transforms import functional as Ft +import random +from tinygrad.tensor import Tensor +from PIL import Image +import numpy as np +import torch +import argparse +import cv2 + + +class Resize: + def __init__(self, min_size, max_size): + if not isinstance(min_size, (list, tuple)): + min_size = (min_size,) + self.min_size = min_size + self.max_size = max_size + + # modified from torchvision to add support for max size + def get_size(self, image_size): + w, h = image_size + size = random.choice(self.min_size) + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * 
w / h) + + return (oh, ow) + + def __call__(self, image): + size = self.get_size(image.size) + image = Ft.resize(image, size) + return image + + +class Normalize: + def __init__(self, mean, std, to_bgr255=True): + self.mean = mean + self.std = std + self.to_bgr255 = to_bgr255 + + def __call__(self, image): + if self.to_bgr255: + image = image[[2, 1, 0]] * 255 + else: + image = image[[0, 1, 2]] * 255 + image = Ft.normalize(image, mean=self.mean, std=self.std) + return image + +transforms = lambda size_scale: T.Compose( + [ + Resize(int(800*size_scale), int(1333*size_scale)), + T.ToTensor(), + Normalize( + mean=[102.9801, 115.9465, 122.7717], std=[1., 1., 1.], to_bgr255=True + ), + ] +) + +def expand_boxes(boxes, scale): + w_half = (boxes[:, 2] - boxes[:, 0]) * .5 + h_half = (boxes[:, 3] - boxes[:, 1]) * .5 + x_c = (boxes[:, 2] + boxes[:, 0]) * .5 + y_c = (boxes[:, 3] + boxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_exp = torch.zeros_like(boxes) + boxes_exp[:, 0] = x_c - w_half + boxes_exp[:, 2] = x_c + w_half + boxes_exp[:, 1] = y_c - h_half + boxes_exp[:, 3] = y_c + h_half + return boxes_exp + + +def expand_masks(mask, padding): + N = mask.shape[0] + M = mask.shape[-1] + pad2 = 2 * padding + scale = float(M + pad2) / M + padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2)) + padded_mask[:, :, padding:-padding, padding:-padding] = mask + return padded_mask, scale + + +def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1): + # TODO: remove torch + mask = torch.tensor(mask.numpy()) + box = torch.tensor(box.numpy()) + padded_mask, scale = expand_masks(mask[None], padding=padding) + mask = padded_mask[0, 0] + box = expand_boxes(box[None], scale)[0] + box = box.to(dtype=torch.int32) + + TO_REMOVE = 1 + w = int(box[2] - box[0] + TO_REMOVE) + h = int(box[3] - box[1] + TO_REMOVE) + w = max(w, 1) + h = max(h, 1) + + mask = mask.expand((1, 1, -1, -1)) + + mask = mask.to(torch.float32) + mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False) + mask = mask[0][0] + + if thresh >= 0: + mask = mask > thresh + else: + mask = (mask * 255).to(torch.uint8) + + im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8) + x_0 = max(box[0], 0) + x_1 = min(box[2] + 1, im_w) + y_0 = max(box[1], 0) + y_1 = min(box[3] + 1, im_h) + + im_mask[y_0:y_1, x_0:x_1] = mask[ + (y_0 - box[1]): (y_1 - box[1]), (x_0 - box[0]): (x_1 - box[0]) + ] + return im_mask + + +class Masker: + def __init__(self, threshold=0.5, padding=1): + self.threshold = threshold + self.padding = padding + + def forward_single_image(self, masks, boxes): + boxes = boxes.convert("xyxy") + im_w, im_h = boxes.size + res = [ + paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding) + for mask, box in zip(masks, boxes.bbox) + ] + if len(res) > 0: + res = torch.stack(res, dim=0)[:, None] + else: + res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1])) + return Tensor(res.numpy()) + + def __call__(self, masks, boxes): + if isinstance(boxes, BoxList): + boxes = [boxes] + + results = [] + for mask, box in zip(masks, boxes): + result = self.forward_single_image(mask, box) + results.append(result) + return results + + +masker = Masker(threshold=0.5, padding=1) + +def select_top_predictions(predictions, confidence_threshold=0.9): + scores = predictions.get_field("scores").numpy() + keep = [idx for idx, score in enumerate(scores) if score > confidence_threshold] + return predictions[keep] + +def compute_prediction(original_image, model, confidence_threshold, size_scale=1.0): + 
image = transforms(size_scale)(original_image).numpy() + image = Tensor(image, requires_grad=False) + predictions = model(image) + prediction = predictions[0] + prediction = select_top_predictions(prediction, confidence_threshold) + width, height = original_image.size + prediction = prediction.resize((width, height)) + + if prediction.has_field("mask"): + masks = prediction.get_field("mask") + masks = masker([masks], [prediction])[0] + prediction.add_field("mask", masks) + return prediction + +def compute_prediction_batched(batch, model, size_scale=1.0): + imgs = [] + for img in batch: + imgs.append(transforms(size_scale)(img).numpy()) + image = [Tensor(image, requires_grad=False) for image in imgs] + predictions = model(image) + del image + return predictions + +palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1]) + +def findContours(*args, **kwargs): + if cv2.__version__.startswith('4'): + contours, hierarchy = cv2.findContours(*args, **kwargs) + elif cv2.__version__.startswith('3'): + _, contours, hierarchy = cv2.findContours(*args, **kwargs) + return contours, hierarchy + +def compute_colors_for_labels(labels): + l = labels[:, None] + colors = l * palette + colors = (colors % 255).astype("uint8") + return colors + +def overlay_mask(image, predictions): + image = np.asarray(image) + masks = predictions.get_field("mask").numpy() + labels = predictions.get_field("labels").numpy() + + colors = compute_colors_for_labels(labels).tolist() + + for mask, color in zip(masks, colors): + thresh = mask[0, :, :, None] + contours, hierarchy = findContours( + thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE + ) + image = cv2.drawContours(image, contours, -1, color, 3) + + composite = image + + return composite + +CATEGORIES = [ + "__background", "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", + "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", + "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", + "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", + "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", +] + +def overlay_boxes(image, predictions): + labels = predictions.get_field("labels").numpy() + boxes = predictions.bbox + image = np.asarray(image) + colors = compute_colors_for_labels(labels).tolist() + + for box, color in zip(boxes, colors): + box = torch.tensor(box.numpy()) + box = box.to(torch.int64) + top_left, bottom_right = box[:2].tolist(), box[2:].tolist() + image = cv2.rectangle( + image, tuple(top_left), tuple(bottom_right), tuple(color), 1 + ) + + return image + +def overlay_class_names(image, predictions): + scores = predictions.get_field("scores").numpy().tolist() + labels = predictions.get_field("labels").numpy().tolist() + labels = [CATEGORIES[int(i)] for i in labels] + boxes = predictions.bbox.numpy() + image = np.asarray(image) + template = "{}: {:.2f}" + for box, score, label in zip(boxes, scores, labels): + x, y = box[:2] + s = template.format(label, score) + x, y = int(x), 
int(y) + cv2.putText( + image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1 + ) + + return image + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Run MaskRCNN', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--image', type=str, help="Path of the image to run") + parser.add_argument('--threshold', type=float, default=0.7, help="Detector threshold") + parser.add_argument('--size_scale', type=float, default=1.0, help="Image resize multiplier") + parser.add_argument('--out', type=str, default="/tmp/rendered.png", help="Output filename") + args = parser.parse_args() + + resnet = ResNet(50, num_classes=None, stride_in_1x1=True) + model_tiny = MaskRCNN(resnet) + model_tiny.load_from_pretrained() + img = Image.open(args.image) + top_result_tiny = compute_prediction(img, model_tiny, confidence_threshold=args.threshold, size_scale=args.size_scale) + bbox_image = overlay_boxes(img, top_result_tiny) + mask_image = overlay_mask(bbox_image, top_result_tiny) + final_image = overlay_class_names(mask_image, top_result_tiny) + + im = Image.fromarray(final_image) + print(f"saving {args.out}") + im.save(args.out) + im.show() diff --git a/examples/mlperf/model_eval.py b/examples/mlperf/model_eval.py index 0c6b9254..ab72f1eb 100644 --- a/examples/mlperf/model_eval.py +++ b/examples/mlperf/model_eval.py @@ -184,14 +184,51 @@ def eval_bert(): st = time.perf_counter() +def eval_mrcnn(): + from tqdm import tqdm + from models.mask_rcnn import MaskRCNN + from models.resnet import ResNet + from datasets.coco import BASEDIR, images, convert_prediction_to_coco_bbox, convert_prediction_to_coco_mask, accumulate_predictions_for_coco, evaluate_predictions_on_coco, iterate + from examples.mask_rcnn import compute_prediction_batched, Image + mdl = MaskRCNN(ResNet(50, num_classes=None, stride_in_1x1=True)) + mdl.load_from_pretrained() + + bbox_output = '/tmp/results_bbox.json' + mask_output = '/tmp/results_mask.json' + + accumulate_predictions_for_coco([], bbox_output, rm=True) + accumulate_predictions_for_coco([], mask_output, rm=True) + + #TODO: bs > 1 not as accurate + bs = 1 + + for batch in tqdm(iterate(images, bs=bs), total=len(images)//bs): + batch_imgs = [] + for image_row in batch: + image_name = image_row['file_name'] + img = Image.open(BASEDIR/f'val2017/{image_name}').convert("RGB") + batch_imgs.append(img) + batch_result = compute_prediction_batched(batch_imgs, mdl) + for image_row, result in zip(batch, batch_result): + image_name = image_row['file_name'] + box_pred = convert_prediction_to_coco_bbox(image_name, result) + mask_pred = convert_prediction_to_coco_mask(image_name, result) + accumulate_predictions_for_coco(box_pred, bbox_output) + accumulate_predictions_for_coco(mask_pred, mask_output) + del batch_imgs + del batch_result + + evaluate_predictions_on_coco(bbox_output, iou_type='bbox') + evaluate_predictions_on_coco(mask_output, iou_type='segm') + if __name__ == "__main__": # inference only Tensor.training = False Tensor.no_grad = True - models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert").split(",") + models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",") for m in models: nm = f"eval_{m}" if nm in globals(): print(f"eval {m}") - globals()[nm]() + globals()[nm]() \ No newline at end of file diff --git a/examples/mlperf/model_spec.py b/examples/mlperf/model_spec.py index 69ff7cae..9d885724 100644 --- a/examples/mlperf/model_spec.py +++ b/examples/mlperf/model_spec.py @@ -5,7 +5,8 @@ import numpy as 
np def test_model(model, *inputs): GlobalCounters.reset() - model(*inputs).numpy() + out = model(*inputs) + if isinstance(out, Tensor): out = out.numpy() # TODO: return event future to still get the time_sum_s without DEBUG=2 print(f"{GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.time_sum_s*1000:.2f} ms") @@ -49,15 +50,21 @@ def spec_bert(): tt = Tensor(np.random.randint(0, 2, (1, 384)).astype(np.float32)) test_model(mdl, x, am, tt) +def spec_mrcnn(): + from models.mask_rcnn import MaskRCNN, ResNet + mdl = MaskRCNN(ResNet(50, num_classes=None, stride_in_1x1=True)) + mdl.load_from_pretrained() + x = Tensor.randn(3, 224, 224) + test_model(mdl, [x]) + if __name__ == "__main__": # inference only for now Tensor.training = False Tensor.no_grad = True - for m in getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert").split(","): + for m in getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(","): nm = f"spec_{m}" if nm in globals(): print(f"testing {m}") globals()[nm]() - diff --git a/models/mask_rcnn.py b/models/mask_rcnn.py new file mode 100644 index 00000000..5f32b828 --- /dev/null +++ b/models/mask_rcnn.py @@ -0,0 +1,1273 @@ +import re +import math +import os +import numpy as np +from pathlib import Path +from tinygrad import nn +from tinygrad.tensor import Tensor +from tinygrad.helpers import dtypes +from extra.utils import get_child, download_file +from tinygrad.state import torch_load +from models.resnet import ResNet +from models.retinanet import nms as _box_nms + + +USE_NP_GATHER = os.getenv('FULL_TINYGRAD', '0') == '0' + +def rint(tensor): + x = (tensor*2).cast(dtypes.int32).contiguous().cast(dtypes.float32)/2 + return (x<0).where(x.floor(), x.ceil()) + +def nearest_interpolate(tensor, scale_factor): + bs, c, py, px = tensor.shape + return tensor.reshape(bs, c, py, 1, px, 1).expand(bs, c, py, scale_factor, px, scale_factor).reshape(bs, c, py * scale_factor, px * scale_factor) + +def meshgrid(x, y): + grid_x = Tensor.cat(*[x[idx:idx+1].expand(y.shape).unsqueeze(0) for idx in range(x.shape[0])]) + grid_y = Tensor.cat(*[y.unsqueeze(0)]*x.shape[0]) + return grid_x.reshape(-1, 1), grid_y.reshape(-1, 1) + +def topk(input_, k, dim=-1, largest=True, sorted=False): + k = min(k, input_.shape[dim]-1) + input_ = input_.numpy() + if largest: input_ *= -1 + ind = np.argpartition(input_, k, axis=dim) + if largest: input_ *= -1 + ind = np.take(ind, np.arange(k), axis=dim) # k non-sorted indices + input_ = np.take_along_axis(input_, ind, axis=dim) # k non-sorted values + if not sorted: return Tensor(input_), ind + if largest: input_ *= -1 + ind_part = np.argsort(input_, axis=dim) + ind = np.take_along_axis(ind, ind_part, axis=dim) + if largest: input_ *= -1 + val = np.take_along_axis(input_, ind_part, axis=dim) + return Tensor(val), ind + +# This is very slow for large arrays, or indices +def _gather(array, indices): + indices = indices.float().to(array.device) + reshape_arg = [1]*array.ndim + [array.shape[-1]] + return Tensor.where( + indices.unsqueeze(indices.ndim).expand(*indices.shape, array.shape[-1]) == Tensor.arange(array.shape[-1]).reshape(*reshape_arg).expand(*indices.shape, array.shape[-1]), + array, 0, + ).sum(indices.ndim) + +# TODO: replace npgather with a faster gather using tinygrad only +# NOTE: this blocks the gradient +def npgather(array,indices): + if isinstance(array, Tensor): array = array.numpy() + if isinstance(indices, Tensor): indices = indices.numpy() + if isinstance(indices, list): indices = np.asarray(indices) + return Tensor(array[indices.astype(int)]) 
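+# usage sketch (hypothetical values): npgather indexes along the first axis via numpy
+# fancy indexing, e.g. npgather(Tensor([[1., 2.], [3., 4.]]), [1, 0]) -> Tensor([[3., 4.], [1., 2.]]);
+# tensor_getitem below picks it over _gather when FULL_TINYGRAD=0 (the default)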
+ +def get_strides(shape): + prod = [1] + for idx in range(len(shape)-1, -1, -1): prod.append(prod[-1] * shape[idx]) + # something about ints is broken with gpu, cuda + return Tensor(prod[::-1][1:], dtype=dtypes.int32).unsqueeze(0).cpu() + +# with keys as integer array for all axes +def tensor_getitem(tensor, *keys): + # something about ints is broken with gpu, cuda + flat_keys = Tensor.stack([key.expand((sum(keys)).shape).reshape(-1) for key in keys], dim=1).cpu().cast(dtypes.int32) + strides = get_strides(tensor.shape) + idxs = (flat_keys * strides).sum(1) + gatherer = npgather if USE_NP_GATHER else _gather + return gatherer(tensor.reshape(-1), idxs).reshape(sum(keys).shape) + + +# for gather with indicies only on axis=0 +def tensor_gather(tensor, indices): + if not isinstance(indices, Tensor): + indices = Tensor(indices, requires_grad=False) + if len(tensor.shape) > 2: + rem_shape = list(tensor.shape)[1:] + tensor = tensor.reshape(tensor.shape[0], -1) + else: + rem_shape = None + if len(tensor.shape) > 1: + tensor = tensor.T + repeat_arg = [1]*(tensor.ndim-1) + [tensor.shape[-2]] + indices = indices.unsqueeze(indices.ndim).repeat(repeat_arg) + ret = _gather(tensor, indices) + if rem_shape: + ret = ret.reshape([indices.shape[0]] + rem_shape) + else: + ret = _gather(tensor, indices) + del indices + return ret + + +class LastLevelMaxPool: + def __call__(self, x): return [Tensor.max_pool2d(x, 1, 2)] + + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +def permute_and_flatten(layer:Tensor, N, A, C, H, W): + layer = layer.reshape(N, -1, C, H, W) + layer = layer.permute(0, 3, 4, 1, 2) + layer = layer.reshape(N, -1, C) + return layer + + +class BoxList: + def __init__(self, bbox, image_size, mode="xyxy"): + if not isinstance(bbox, Tensor): + bbox = Tensor(bbox) + if bbox.ndim != 2: + raise ValueError( + "bbox should have 2 dimensions, got {}".format(bbox.ndim) + ) + if bbox.shape[-1] != 4: + raise ValueError( + "last dimenion of bbox should have a " + "size of 4, got {}".format(bbox.shape[-1]) + ) + if mode not in ("xyxy", "xywh"): + raise ValueError("mode should be 'xyxy' or 'xywh'") + + self.bbox = bbox + self.size = image_size # (image_width, image_height) + self.mode = mode + self.extra_fields = {} + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_boxes={}, ".format(len(self)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s + + def area(self): + box = self.bbox + if self.mode == "xyxy": + TO_REMOVE = 1 + area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE) + elif self.mode == "xywh": + area = box[:, 2] * box[:, 3] + return area + + def add_field(self, field, field_data): + self.extra_fields[field] = field_data + + def get_field(self, field): + return self.extra_fields[field] + + def has_field(self, field): + return field in self.extra_fields + + def fields(self): + return list(self.extra_fields.keys()) + + def _copy_extra_fields(self, bbox): + for k, v in bbox.extra_fields.items(): + self.extra_fields[k] = v + + def convert(self, mode): + if mode == self.mode: + return self + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if mode == "xyxy": + bbox = Tensor.cat(*(xmin, ymin, xmax, ymax), dim=-1) + bbox = BoxList(bbox, self.size, mode=mode) + else: + TO_REMOVE = 1 + bbox = Tensor.cat( + *(xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), dim=-1 + ) + bbox = BoxList(bbox, self.size, mode=mode) + bbox._copy_extra_fields(self) 
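+    # extra fields (e.g. "scores", "labels", "mask") are carried over unchanged by the conversion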
+ return bbox + + def _split_into_xyxy(self): + if self.mode == "xyxy": + xmin, ymin, xmax, ymax = self.bbox.chunk(4, dim=-1) + return xmin, ymin, xmax, ymax + elif self.mode == "xywh": + TO_REMOVE = 1 + xmin, ymin, w, h = self.bbox.chunk(4, dim=-1) + return ( + xmin, + ymin, + xmin + (w - TO_REMOVE).clamp(min=0), + ymin + (h - TO_REMOVE).clamp(min=0), + ) + + def resize(self, size, *args, **kwargs): + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_box = self.bbox * ratio + bbox = BoxList(scaled_box, size, mode=self.mode) + for k, v in self.extra_fields.items(): + if not isinstance(v, Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + return bbox + + ratio_width, ratio_height = ratios + xmin, ymin, xmax, ymax = self._split_into_xyxy() + scaled_xmin = xmin * ratio_width + scaled_xmax = xmax * ratio_width + scaled_ymin = ymin * ratio_height + scaled_ymax = ymax * ratio_height + scaled_box = Tensor.cat( + *(scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1 + ) + bbox = BoxList(scaled_box, size, mode="xyxy") + for k, v in self.extra_fields.items(): + if not isinstance(v, Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + + return bbox.convert(self.mode) + + def transpose(self, method): + image_width, image_height = self.size + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if method == FLIP_LEFT_RIGHT: + TO_REMOVE = 1 + transposed_xmin = image_width - xmax - TO_REMOVE + transposed_xmax = image_width - xmin - TO_REMOVE + transposed_ymin = ymin + transposed_ymax = ymax + elif method == FLIP_TOP_BOTTOM: + transposed_xmin = xmin + transposed_xmax = xmax + transposed_ymin = image_height - ymax + transposed_ymax = image_height - ymin + + transposed_boxes = Tensor.cat( + *(transposed_xmin, transposed_ymin, transposed_xmax, transposed_ymax), dim=-1 + ) + bbox = BoxList(transposed_boxes, self.size, mode="xyxy") + for k, v in self.extra_fields.items(): + if not isinstance(v, Tensor): + v = v.transpose(method) + bbox.add_field(k, v) + return bbox.convert(self.mode) + + def clip_to_image(self, remove_empty=True): + TO_REMOVE = 1 + bb1 = self.bbox.clip(min_=0, max_=self.size[0] - TO_REMOVE)[:, 0] + bb2 = self.bbox.clip(min_=0, max_=self.size[1] - TO_REMOVE)[:, 1] + bb3 = self.bbox.clip(min_=0, max_=self.size[0] - TO_REMOVE)[:, 2] + bb4 = self.bbox.clip(min_=0, max_=self.size[1] - TO_REMOVE)[:, 3] + self.bbox = Tensor.stack((bb1, bb2, bb3, bb4), dim=1) + if remove_empty: + box = self.bbox + keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) + return self[keep] + return self + + def __getitem__(self, item): + if isinstance(item, list): + if len(item) == 0: + return [] + if sum(item) == len(item) and isinstance(item[0], bool): + return self + bbox = BoxList(tensor_gather(self.bbox, item), self.size, self.mode) + for k, v in self.extra_fields.items(): + bbox.add_field(k, tensor_gather(v, item)) + return bbox + + def __len__(self): + return self.bbox.shape[0] + + +def cat_boxlist(bboxes): + size = bboxes[0].size + mode = bboxes[0].mode + fields = set(bboxes[0].fields()) + cat_box_list = [bbox.bbox for bbox in bboxes if bbox.bbox.shape[0] > 0] + + if len(cat_box_list) > 0: + cat_boxes = BoxList(Tensor.cat(*cat_box_list, dim=0), size, mode) + else: + cat_boxes = BoxList(bboxes[0].bbox, size, mode) + for field in fields: + cat_field_list = [bbox.get_field(field) for bbox in bboxes if bbox.get_field(field).shape[0] > 0] + + if len(cat_box_list) > 0: + data = 
Tensor.cat(*cat_field_list, dim=0) + else: + data = bboxes[0].get_field(field) + + cat_boxes.add_field(field, data) + + return cat_boxes + + +class FPN: + def __init__(self, in_channels_list, out_channels): + self.inner_blocks, self.layer_blocks = [], [] + for in_channels in in_channels_list: + self.inner_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=1)) + self.layer_blocks.append(nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)) + self.top_block = LastLevelMaxPool() + + def __call__(self, x: Tensor): + last_inner = self.inner_blocks[-1](x[-1]) + results = [] + results.append(self.layer_blocks[-1](last_inner)) + for feature, inner_block, layer_block in zip( + x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] + ): + if not inner_block: + continue + inner_top_down = nearest_interpolate(last_inner, scale_factor=2) + inner_lateral = inner_block(feature) + last_inner = inner_lateral + inner_top_down + layer_result = layer_block(last_inner) + results.insert(0, layer_result) + last_results = self.top_block(results[-1]) + results.extend(last_results) + + return tuple(results) + + +class ResNetFPN: + def __init__(self, resnet, out_channels=256): + self.out_channels = out_channels + self.body = resnet + in_channels_stage2 = 256 + in_channels_list = [ + in_channels_stage2, + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ] + self.fpn = FPN(in_channels_list, out_channels) + + def __call__(self, x): + x = self.body(x) + return self.fpn(x) + + +class AnchorGenerator: + def __init__( + self, + sizes=(32, 64, 128, 256, 512), + aspect_ratios=(0.5, 1.0, 2.0), + anchor_strides=(4, 8, 16, 32, 64), + straddle_thresh=0, + ): + if len(anchor_strides) == 1: + anchor_stride = anchor_strides[0] + cell_anchors = [ + generate_anchors(anchor_stride, sizes, aspect_ratios) + ] + else: + if len(anchor_strides) != len(sizes): + raise RuntimeError("FPN should have #anchor_strides == #sizes") + + cell_anchors = [ + generate_anchors( + anchor_stride, + size if isinstance(size, (tuple, list)) else (size,), + aspect_ratios + ) + for anchor_stride, size in zip(anchor_strides, sizes) + ] + self.strides = anchor_strides + self.cell_anchors = cell_anchors + self.straddle_thresh = straddle_thresh + + def num_anchors_per_location(self): + return [cell_anchors.shape[0] for cell_anchors in self.cell_anchors] + + def grid_anchors(self, grid_sizes): + anchors = [] + for size, stride, base_anchors in zip( + grid_sizes, self.strides, self.cell_anchors + ): + grid_height, grid_width = size + device = base_anchors.device + shifts_x = Tensor.arange( + start=0, stop=grid_width * stride, step=stride, dtype=dtypes.float32, device=device + ) + shifts_y = Tensor.arange( + start=0, stop=grid_height * stride, step=stride, dtype=dtypes.float32, device=device + ) + shift_y, shift_x = meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + shifts = Tensor.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + + anchors.append( + (shifts.reshape(-1, 1, 4) + base_anchors.reshape(1, -1, 4)).reshape(-1, 4) + ) + + return anchors + + def add_visibility_to(self, boxlist): + image_width, image_height = boxlist.size + anchors = boxlist.bbox + if self.straddle_thresh >= 0: + inds_inside = ( + (anchors[:, 0] >= -self.straddle_thresh) + * (anchors[:, 1] >= -self.straddle_thresh) + * (anchors[:, 2] < image_width + self.straddle_thresh) + * (anchors[:, 3] < image_height + self.straddle_thresh) + ) + else: + device = anchors.device + 
inds_inside = Tensor.ones(anchors.shape[0], dtype=dtypes.uint8, device=device) + boxlist.add_field("visibility", inds_inside) + + def __call__(self, image_list, feature_maps): + grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] + anchors_over_all_feature_maps = self.grid_anchors(grid_sizes) + anchors = [] + for (image_height, image_width) in image_list.image_sizes: + anchors_in_image = [] + for anchors_per_feature_map in anchors_over_all_feature_maps: + boxlist = BoxList( + anchors_per_feature_map, (image_width, image_height), mode="xyxy" + ) + self.add_visibility_to(boxlist) + anchors_in_image.append(boxlist) + anchors.append(anchors_in_image) + return anchors + + +def generate_anchors( + stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2) +): + return _generate_anchors(stride, Tensor(list(sizes)) / stride, Tensor(list(aspect_ratios))) + + +def _generate_anchors(base_size, scales, aspect_ratios): + anchor = Tensor([1, 1, base_size, base_size]) - 1 + anchors = _ratio_enum(anchor, aspect_ratios) + anchors = Tensor.cat( + *[_scale_enum(anchors[i, :], scales).reshape(-1, 4) for i in range(anchors.shape[0])] + ) + return anchors + + +def _whctrs(anchor): + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + + +def _mkanchors(ws, hs, x_ctr, y_ctr): + ws = ws[:, None] + hs = hs[:, None] + anchors = Tensor.cat(*( + x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1), + ), dim=1) + return anchors + + +def _ratio_enum(anchor, ratios): + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = rint(Tensor.sqrt(size_ratios)) + hs = rint(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + +def _scale_enum(anchor, scales): + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + +class RPNHead: + def __init__(self, in_channels, num_anchors): + self.conv = nn.Conv2d(in_channels, 256, kernel_size=3, padding=1) + self.cls_logits = nn.Conv2d(256, num_anchors, kernel_size=1) + self.bbox_pred = nn.Conv2d(256, num_anchors * 4, kernel_size=1) + + def __call__(self, x): + logits = [] + bbox_reg = [] + for feature in x: + t = Tensor.relu(self.conv(feature)) + logits.append(self.cls_logits(t)) + bbox_reg.append(self.bbox_pred(t)) + return logits, bbox_reg + + +class BoxCoder(object): + def __init__(self, weights, bbox_xform_clip=math.log(1000. 
/ 16)): + self.weights = weights + self.bbox_xform_clip = bbox_xform_clip + + def encode(self, reference_boxes, proposals): + TO_REMOVE = 1 # TODO remove + ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE + ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE + ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths + ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights + + gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE + gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE + gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths + gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights + + wx, wy, ww, wh = self.weights + targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = ww * Tensor.log(gt_widths / ex_widths) + targets_dh = wh * Tensor.log(gt_heights / ex_heights) + + targets = Tensor.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + return targets + + def decode(self, rel_codes, boxes): + boxes = boxes.cast(rel_codes.dtype) + rel_codes = rel_codes + + TO_REMOVE = 1 # TODO remove + widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE + heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = self.weights + dx = rel_codes[:, 0::4] / wx + dy = rel_codes[:, 1::4] / wy + dw = rel_codes[:, 2::4] / ww + dh = rel_codes[:, 3::4] / wh + + # Prevent sending too large values into Tensor.exp() + dw = dw.clip(min_=dw.min(), max_=self.bbox_xform_clip) + dh = dh.clip(min_=dh.min(), max_=self.bbox_xform_clip) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = dw.exp() * widths[:, None] + pred_h = dh.exp() * heights[:, None] + x = pred_ctr_x - 0.5 * pred_w + y = pred_ctr_y - 0.5 * pred_h + w = pred_ctr_x + 0.5 * pred_w - 1 + h = pred_ctr_y + 0.5 * pred_h - 1 + pred_boxes = Tensor.stack([x, y, w, h]).permute(1,2,0).reshape(rel_codes.shape[0], rel_codes.shape[1]) + return pred_boxes + + +def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"): + if nms_thresh <= 0: + return boxlist + mode = boxlist.mode + boxlist = boxlist.convert("xyxy") + boxes = boxlist.bbox + score = boxlist.get_field(score_field) + keep = _box_nms(boxes.numpy(), score.numpy(), nms_thresh) + if max_proposals > 0: + keep = keep[:max_proposals] + boxlist = boxlist[keep] + return boxlist.convert(mode) + + +def remove_small_boxes(boxlist, min_size): + xywh_boxes = boxlist.convert("xywh").bbox + _, _, ws, hs = xywh_boxes.chunk(4, dim=1) + keep = (( + (ws >= min_size) * (hs >= min_size) + ) > 0).reshape(-1) + if keep.sum().numpy() == len(boxlist): + return boxlist + else: + keep = keep.numpy().nonzero()[0] + return boxlist[keep] + + +class RPNPostProcessor: + # Not used in Loss calculation + def __init__( + self, + pre_nms_top_n, + post_nms_top_n, + nms_thresh, + min_size, + box_coder=None, + fpn_post_nms_top_n=None, + ): + self.pre_nms_top_n = pre_nms_top_n + self.post_nms_top_n = post_nms_top_n + self.nms_thresh = nms_thresh + self.min_size = min_size + + if box_coder is None: + box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + self.box_coder = box_coder + + if fpn_post_nms_top_n is None: + fpn_post_nms_top_n = post_nms_top_n + self.fpn_post_nms_top_n = fpn_post_nms_top_n + + def forward_for_single_feature_map(self, anchors, objectness, box_regression): + device = objectness.device + N, A, H, W = objectness.shape + objectness = 
permute_and_flatten(objectness, N, A, 1, H, W).reshape(N, -1) + objectness = objectness.sigmoid() + + box_regression = permute_and_flatten(box_regression, N, A, 4, H, W) + + num_anchors = A * H * W + + pre_nms_top_n = min(self.pre_nms_top_n, num_anchors) + objectness, topk_idx = topk(objectness, pre_nms_top_n, dim=1, sorted=False) + concat_anchors = Tensor.cat(*[a.bbox for a in anchors], dim=0).reshape(N, -1, 4) + image_shapes = [box.size for box in anchors] + + box_regression_list = [] + concat_anchors_list = [] + for batch_idx in range(N): + box_regression_list.append(tensor_gather(box_regression[batch_idx], topk_idx[batch_idx])) + concat_anchors_list.append(tensor_gather(concat_anchors[batch_idx], topk_idx[batch_idx])) + + box_regression = Tensor.stack(box_regression_list) + concat_anchors = Tensor.stack(concat_anchors_list) + + proposals = self.box_coder.decode( + box_regression.reshape(-1, 4), concat_anchors.reshape(-1, 4) + ) + + proposals = proposals.reshape(N, -1, 4) + + result = [] + for proposal, score, im_shape in zip(proposals, objectness, image_shapes): + boxlist = BoxList(proposal, im_shape, mode="xyxy") + boxlist.add_field("objectness", score) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + boxlist = boxlist_nms( + boxlist, + self.nms_thresh, + max_proposals=self.post_nms_top_n, + score_field="objectness", + ) + result.append(boxlist) + return result + + def __call__(self, anchors, objectness, box_regression): + sampled_boxes = [] + num_levels = len(objectness) + anchors = list(zip(*anchors)) + for a, o, b in zip(anchors, objectness, box_regression): + sampled_boxes.append(self.forward_for_single_feature_map(a, o, b)) + + boxlists = list(zip(*sampled_boxes)) + boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] + + if num_levels > 1: + boxlists = self.select_over_all_levels(boxlists) + + return boxlists + + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + for i in range(num_images): + objectness = boxlists[i].get_field("objectness") + post_nms_top_n = min(self.fpn_post_nms_top_n, objectness.shape[0]) + _, inds_sorted = topk(objectness, + post_nms_top_n, dim=0, sorted=False + ) + boxlists[i] = boxlists[i][inds_sorted] + return boxlists + + +class RPN: + def __init__(self, in_channels): + self.anchor_generator = AnchorGenerator() + + in_channels = 256 + head = RPNHead( + in_channels, self.anchor_generator.num_anchors_per_location()[0] + ) + rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + box_selector_test = RPNPostProcessor( + pre_nms_top_n=1000, + post_nms_top_n=1000, + nms_thresh=0.7, + min_size=0, + box_coder=rpn_box_coder, + fpn_post_nms_top_n=1000 + ) + self.head = head + self.box_selector_test = box_selector_test + + def __call__(self, images, features, targets=None): + objectness, rpn_box_regression = self.head(features) + anchors = self.anchor_generator(images, features) + boxes = self.box_selector_test(anchors, objectness, rpn_box_regression) + return boxes, {} + + +def make_conv3x3( + in_channels, + out_channels, + dilation=1, + stride=1, + use_gn=False, +): + conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False if use_gn else True + ) + return conv + + +class MaskRCNNFPNFeatureExtractor: + def __init__(self): + resolution = 14 + scales = (0.25, 0.125, 0.0625, 0.03125) + sampling_ratio = 2 + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + 
sampling_ratio=sampling_ratio, + ) + input_size = 256 + self.pooler = pooler + + use_gn = False + layers = (256, 256, 256, 256) + dilation = 1 + self.mask_fcn1 = make_conv3x3(input_size, layers[0], dilation=dilation, stride=1, use_gn=use_gn) + self.mask_fcn2 = make_conv3x3(layers[0], layers[1], dilation=dilation, stride=1, use_gn=use_gn) + self.mask_fcn3 = make_conv3x3(layers[1], layers[2], dilation=dilation, stride=1, use_gn=use_gn) + self.mask_fcn4 = make_conv3x3(layers[2], layers[3], dilation=dilation, stride=1, use_gn=use_gn) + self.blocks = [self.mask_fcn1, self.mask_fcn2, self.mask_fcn3, self.mask_fcn4] + + def __call__(self, x, proposals): + x = self.pooler(x, proposals) + for layer in self.blocks: + if x is not None: + x = Tensor.relu(layer(x)) + return x + + +class MaskRCNNC4Predictor: + def __init__(self): + num_classes = 81 + dim_reduced = 256 + num_inputs = dim_reduced + self.conv5_mask = nn.ConvTranspose2d(num_inputs, dim_reduced, 2, 2, 0) + self.mask_fcn_logits = nn.Conv2d(dim_reduced, num_classes, 1, 1, 0) + + def __call__(self, x): + x = Tensor.relu(self.conv5_mask(x)) + return self.mask_fcn_logits(x) + + +class FPN2MLPFeatureExtractor: + def __init__(self, cfg): + resolution = 7 + scales = (0.25, 0.125, 0.0625, 0.03125) + sampling_ratio = 2 + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + input_size = 256 * resolution ** 2 + representation_size = 1024 + self.pooler = pooler + self.fc6 = nn.Linear(input_size, representation_size) + self.fc7 = nn.Linear(representation_size, representation_size) + + def __call__(self, x, proposals): + x = self.pooler(x, proposals) + x = x.reshape(x.shape[0], -1) + x = Tensor.relu(self.fc6(x)) + x = Tensor.relu(self.fc7(x)) + return x + + +def _bilinear_interpolate( + input, # [N, C, H, W] + roi_batch_ind, # [K] + y, # [K, PH, IY] + x, # [K, PW, IX] + ymask, # [K, IY] + xmask, # [K, IX] +): + _, channels, height, width = input.shape + y = y.clip(min_=0.0, max_=float(height-1)) + x = x.clip(min_=0.0, max_=float(width-1)) + + # Tensor.where doesnt work well with int32 data so cast to float32 + y_low = y.cast(dtypes.int32).contiguous().float().contiguous() + x_low = x.cast(dtypes.int32).contiguous().float().contiguous() + + y_high = Tensor.where(y_low >= height - 1, float(height - 1), y_low + 1) + y_low = Tensor.where(y_low >= height - 1, float(height - 1), y_low) + + x_high = Tensor.where(x_low >= width - 1, float(width - 1), x_low + 1) + x_low = Tensor.where(x_low >= width - 1, float(width - 1), x_low) + + ly = y - y_low + lx = x - x_low + hy = 1.0 - ly + hx = 1.0 - lx + + def masked_index( + y, # [K, PH, IY] + x, # [K, PW, IX] + ): + if ymask is not None: + assert xmask is not None + y = Tensor.where(ymask[:, None, :], y, 0) + x = Tensor.where(xmask[:, None, :], x, 0) + key1 = roi_batch_ind[:, None, None, None, None, None] + key2 = Tensor.arange(channels, device=input.device)[None, :, None, None, None, None] + key3 = y[:, None, :, None, :, None] + key4 = x[:, None, None, :, None, :] + return tensor_getitem(input,key1,key2,key3,key4) # [K, C, PH, PW, IY, IX] + + v1 = masked_index(y_low, x_low) + v2 = masked_index(y_low, x_high) + v3 = masked_index(y_high, x_low) + v4 = masked_index(y_high, x_high) + + # all ws preemptively [K, C, PH, PW, IY, IX] + def outer_prod(y, x): + return y[:, None, :, None, :, None] * x[:, None, None, :, None, :] + + w1 = outer_prod(hy, hx) + w2 = outer_prod(hy, lx) + w3 = outer_prod(ly, hx) + w4 = outer_prod(ly, lx) + + val = w1*v1 + w2*v2 + w3*v3 + w4*v4 
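+  # w1..w4 = (hy*hx, hy*lx, ly*hx, ly*lx) sum to 1, so val is the bilinear (convex)
+  # combination of the four neighbouring feature values v1..v4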
+ return val + +#https://pytorch.org/vision/main/_modules/torchvision/ops/roi_align.html#roi_align +def _roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): + orig_dtype = input.dtype + _, _, height, width = input.shape + ph = Tensor.arange(pooled_height, device=input.device) + pw = Tensor.arange(pooled_width, device=input.device) + + roi_batch_ind = rois[:, 0].cast(dtypes.int32).contiguous() + offset = 0.5 if aligned else 0.0 + roi_start_w = rois[:, 1] * spatial_scale - offset + roi_start_h = rois[:, 2] * spatial_scale - offset + roi_end_w = rois[:, 3] * spatial_scale - offset + roi_end_h = rois[:, 4] * spatial_scale - offset + + roi_width = roi_end_w - roi_start_w + roi_height = roi_end_h - roi_start_h + if not aligned: + roi_width = roi_width.maximum(1.0) + roi_height = roi_height.maximum(1.0) + + bin_size_h = roi_height / pooled_height + bin_size_w = roi_width / pooled_width + + exact_sampling = sampling_ratio > 0 + roi_bin_grid_h = sampling_ratio if exact_sampling else (roi_height / pooled_height).ceil() + roi_bin_grid_w = sampling_ratio if exact_sampling else (roi_width / pooled_width).ceil() + + if exact_sampling: + count = max(roi_bin_grid_h * roi_bin_grid_w, 1) + iy = Tensor.arange(roi_bin_grid_h, device=input.device) + ix = Tensor.arange(roi_bin_grid_w, device=input.device) + ymask = None + xmask = None + else: + count = (roi_bin_grid_h * roi_bin_grid_w).maximum(1) + iy = Tensor.arange(height, device=input.device) + ix = Tensor.arange(width, device=input.device) + ymask = iy[None, :] < roi_bin_grid_h[:, None] + xmask = ix[None, :] < roi_bin_grid_w[:, None] + + def from_K(t): + return t[:, None, None] + + y = ( + from_K(roi_start_h) + + ph[None, :, None] * from_K(bin_size_h) + + (iy[None, None, :] + 0.5) * from_K(bin_size_h / roi_bin_grid_h) + ) + x = ( + from_K(roi_start_w) + + pw[None, :, None] * from_K(bin_size_w) + + (ix[None, None, :] + 0.5) * from_K(bin_size_w / roi_bin_grid_w) + ) + + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) + if not exact_sampling: + val = ymask[:, None, None, None, :, None].where(val, 0) + val = xmask[:, None, None, None, None, :].where(val, 0) + + output = val.sum((-1, -2)) + if isinstance(count, Tensor): + output /= count[:, None, None, None] + else: + output /= count + + output = output.cast(orig_dtype) + return output + +class ROIAlign: + def __init__(self, output_size, spatial_scale, sampling_ratio): + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + + def __call__(self, input, rois): + output = _roi_align( + input, rois, self.spatial_scale, self.output_size[0], self.output_size[1], self.sampling_ratio, aligned=False + ) + return output + + +class LevelMapper: + def __init__(self, k_min, k_max, canonical_scale=224, canonical_level=4, eps=1e-6): + self.k_min = k_min + self.k_max = k_max + self.s0 = canonical_scale + self.lvl0 = canonical_level + self.eps = eps + + def __call__(self, boxlists): + s = Tensor.sqrt(Tensor.cat(*[boxlist.area() for boxlist in boxlists])) + target_lvls = (self.lvl0 + Tensor.log2(s / self.s0 + self.eps)).floor() + target_lvls = target_lvls.clip(min_=self.k_min, max_=self.k_max) + return target_lvls - self.k_min + + +class Pooler: + def __init__(self, output_size, scales, sampling_ratio): + self.output_size = output_size + self.scales = scales + self.sampling_ratio = sampling_ratio + poolers = [] + for scale in scales: + poolers.append( + ROIAlign( + output_size, spatial_scale=scale, 
sampling_ratio=sampling_ratio + ) + ) + self.poolers = poolers + self.output_size = output_size + lvl_min = -math.log2(scales[0]) + lvl_max = -math.log2(scales[-1]) + self.map_levels = LevelMapper(lvl_min, lvl_max) + + def convert_to_roi_format(self, boxes): + concat_boxes = Tensor.cat(*[b.bbox for b in boxes], dim=0) + device, dtype = concat_boxes.device, concat_boxes.dtype + ids = Tensor.cat( + *[ + Tensor.full((len(b), 1), i, dtype=dtype, device=device) + for i, b in enumerate(boxes) + ], + dim=0, + ) + if concat_boxes.shape[0] != 0: + rois = Tensor.cat(*[ids, concat_boxes], dim=1) + return rois + + def __call__(self, x, boxes): + num_levels = len(self.poolers) + rois = self.convert_to_roi_format(boxes) + if rois: + if num_levels == 1: + return self.poolers[0](x[0], rois) + + levels = self.map_levels(boxes) + results = [] + all_idxs = [] + for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)): + # this is fine because no grad will flow through index + idx_in_level = (levels.numpy() == level).nonzero()[0] + if len(idx_in_level) > 0: + rois_per_level = tensor_gather(rois, idx_in_level) + pooler_output = pooler(per_level_feature, rois_per_level) + all_idxs.extend(idx_in_level) + results.append(pooler_output) + + return tensor_gather(Tensor.cat(*results), [x[0] for x in sorted({i:idx for i, idx in enumerate(all_idxs)}.items(), key=lambda x: x[1])]) + + +class FPNPredictor: + def __init__(self): + num_classes = 81 + representation_size = 1024 + self.cls_score = nn.Linear(representation_size, num_classes) + num_bbox_reg_classes = num_classes + self.bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4) + + def __call__(self, x): + scores = self.cls_score(x) + bbox_deltas = self.bbox_pred(x) + return scores, bbox_deltas + + +class PostProcessor: + # Not used in training + def __init__( + self, + score_thresh=0.05, + nms=0.5, + detections_per_img=100, + box_coder=None, + cls_agnostic_bbox_reg=False + ): + self.score_thresh = score_thresh + self.nms = nms + self.detections_per_img = detections_per_img + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg + + def __call__(self, x, boxes): + class_logits, box_regression = x + class_prob = Tensor.softmax(class_logits, -1) + image_shapes = [box.size for box in boxes] + boxes_per_image = [len(box) for box in boxes] + concat_boxes = Tensor.cat(*[a.bbox for a in boxes], dim=0) + + if self.cls_agnostic_bbox_reg: + box_regression = box_regression[:, -4:] + proposals = self.box_coder.decode( + box_regression.reshape(sum(boxes_per_image), -1), concat_boxes + ) + if self.cls_agnostic_bbox_reg: + proposals = proposals.repeat([1, class_prob.shape[1]]) + num_classes = class_prob.shape[1] + proposals = proposals.unsqueeze(0) + class_prob = class_prob.unsqueeze(0) + results = [] + for prob, boxes_per_img, image_shape in zip( + class_prob, proposals, image_shapes + ): + boxlist = self.prepare_boxlist(boxes_per_img, prob, image_shape) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = self.filter_results(boxlist, num_classes) + results.append(boxlist) + return results + + def prepare_boxlist(self, boxes, scores, image_shape): + boxes = boxes.reshape(-1, 4) + scores = scores.reshape(-1) + boxlist = BoxList(boxes, image_shape, mode="xyxy") + boxlist.add_field("scores", scores) + return boxlist + + def filter_results(self, boxlist, num_classes): + boxes = boxlist.bbox.reshape(-1, num_classes * 4) + scores = 
boxlist.get_field("scores").reshape(-1, num_classes) + + device = scores.device + result = [] + scores = scores.numpy() + boxes = boxes.numpy() + inds_all = scores > self.score_thresh + for j in range(1, num_classes): + inds = inds_all[:, j].nonzero()[0] + # This needs to be done in numpy because it can create empty arrays + scores_j = scores[inds, j] + boxes_j = boxes[inds, j * 4: (j + 1) * 4] + boxes_j = Tensor(boxes_j) + scores_j = Tensor(scores_j) + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + if len(boxlist_for_class): + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", Tensor.full((num_labels,), j, device=device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + if number_of_detections > self.detections_per_img > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = topk(cls_scores, k=self.detections_per_img) + image_thresh = image_thresh.numpy()[-1] + keep = (cls_scores.numpy() >= image_thresh).nonzero()[0] + result = result[keep] + return result + + +class RoIBoxHead: + def __init__(self, in_channels): + self.feature_extractor = FPN2MLPFeatureExtractor(in_channels) + self.predictor = FPNPredictor() + self.post_processor = PostProcessor( + score_thresh=0.05, + nms=0.5, + detections_per_img=100, + box_coder=BoxCoder(weights=(10., 10., 5., 5.)), + cls_agnostic_bbox_reg=False + ) + + def __call__(self, features, proposals, targets=None): + x = self.feature_extractor(features, proposals) + class_logits, box_regression = self.predictor(x) + if not Tensor.training: + result = self.post_processor((class_logits, box_regression), proposals) + return x, result, {} + + +class MaskPostProcessor: + # Not used in loss calculation + def __call__(self, x, boxes): + mask_prob = x.sigmoid().numpy() + num_masks = x.shape[0] + labels = [bbox.get_field("labels") for bbox in boxes] + labels = Tensor.cat(*labels).numpy().astype(np.int32) + index = np.arange(num_masks) + mask_prob = mask_prob[index, labels][:, None] + boxes_per_image, cumsum = [], 0 + for box in boxes: + cumsum += len(box) + boxes_per_image.append(cumsum) + # using numpy here as Tensor.chunk doesnt have custom chunk sizes + mask_prob = np.split(mask_prob, boxes_per_image, axis=0) + results = [] + for prob, box in zip(mask_prob, boxes): + bbox = BoxList(box.bbox, box.size, mode="xyxy") + for field in box.fields(): + bbox.add_field(field, box.get_field(field)) + prob = Tensor(prob) + bbox.add_field("mask", prob) + results.append(bbox) + + return results + + +class Mask: + def __init__(self): + self.feature_extractor = MaskRCNNFPNFeatureExtractor() + self.predictor = MaskRCNNC4Predictor() + self.post_processor = MaskPostProcessor() + + def __call__(self, features, proposals, targets=None): + x = self.feature_extractor(features, proposals) + if x: + mask_logits = self.predictor(x) + if not Tensor.training: + result = self.post_processor(mask_logits, proposals) + return x, result, {} + return x, [], {} + + +class RoIHeads: + def __init__(self, in_channels): + self.box = RoIBoxHead(in_channels) + self.mask = Mask() + + def __call__(self, features, proposals, targets=None): + x, detections, _ = self.box(features, proposals, targets) + x, detections, _ = self.mask(features, detections, targets) + return x, detections, {} + + +class ImageList(object): + def __init__(self, tensors, image_sizes): + self.tensors 
= tensors + self.image_sizes = image_sizes + + def to(self, *args, **kwargs): + cast_tensor = self.tensors.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes) + + +def to_image_list(tensors, size_divisible=32): + # Preprocessing + if isinstance(tensors, Tensor) and size_divisible > 0: + tensors = [tensors] + + if isinstance(tensors, ImageList): + return tensors + elif isinstance(tensors, Tensor): + # single tensor shape can be inferred + assert tensors.ndim == 4 + image_sizes = [tensor.shape[-2:] for tensor in tensors] + return ImageList(tensors, image_sizes) + elif isinstance(tensors, (tuple, list)): + max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) + if size_divisible > 0: + + stride = size_divisible + max_size = list(max_size) + max_size[1] = int(math.ceil(max_size[1] / stride) * stride) + max_size[2] = int(math.ceil(max_size[2] / stride) * stride) + max_size = tuple(max_size) + + batch_shape = (len(tensors),) + max_size + batched_imgs = np.zeros(batch_shape, dtype=tensors[0].dtype.np) + for img, pad_img in zip(tensors, batched_imgs): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]] += img.numpy() + + batched_imgs = Tensor(batched_imgs) + image_sizes = [im.shape[-2:] for im in tensors] + + return ImageList(batched_imgs, image_sizes) + else: + raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) + + +class MaskRCNN: + def __init__(self, backbone: ResNet): + self.backbone = ResNetFPN(backbone, out_channels=256) + self.rpn = RPN(self.backbone.out_channels) + self.roi_heads = RoIHeads(self.backbone.out_channels) + + def load_from_pretrained(self): + fn = Path('./') / "weights/maskrcnn.pt" + download_file("https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_FPN_1x.pth", fn) + + state_dict = torch_load(fn)['model'] + loaded_keys = [] + for k, v in state_dict.items(): + if "module." in k: + k = k.replace("module.", "") + if "stem." 
in k:
+        k = k.replace("stem.", "")
+      # checkpoint keys fpn_inner*/fpn_layer* are 1-indexed; our inner_blocks/layer_blocks lists are 0-indexed
+      if "fpn_inner" in k:
+        block_index = int(re.search(r"fpn_inner(\d+)", k).group(1))
+        k = re.sub(r"fpn_inner\d+", f"inner_blocks.{block_index - 1}", k)
+      if "fpn_layer" in k:
+        block_index = int(re.search(r"fpn_layer(\d+)", k).group(1))
+        k = re.sub(r"fpn_layer\d+", f"layer_blocks.{block_index - 1}", k)
+      loaded_keys.append(k)
+      get_child(self, k).assign(v.numpy()).realize()
+    return loaded_keys
+
+  def __call__(self, images):
+    images = to_image_list(images)
+    features = self.backbone(images.tensors)
+    proposals, _ = self.rpn(images, features)
+    x, result, _ = self.roi_heads(features, proposals)
+    return result
+
+
+if __name__ == '__main__':
+  resnet = ResNet(50, num_classes=None, stride_in_1x1=True)
+  model = MaskRCNN(backbone=resnet)
+  model.load_from_pretrained()
diff --git a/models/resnet.py b/models/resnet.py
index 8bc955a3..b5a756c0 100644
--- a/models/resnet.py
+++ b/models/resnet.py
@@ -27,14 +27,15 @@ class BasicBlock:
 
 class Bottleneck:
-  # NOTE: the original implementation places stride at the first convolution (self.conv1), this is the v1.5 variant
+  # NOTE: stride_in_1x1=False, this is the v1.5 variant
   expansion = 4
 
-  def __init__(self, in_planes, planes, stride=1, groups=1, base_width=64):
+  def __init__(self, in_planes, planes, stride=1, stride_in_1x1=False, groups=1, base_width=64):
     width = int(planes * (base_width / 64.0)) * groups
-    self.conv1 = nn.Conv2d(in_planes, width, kernel_size=1, bias=False)
+    # NOTE: the original implementation places stride at the first convolution (self.conv1), control with stride_in_1x1
+    self.conv1 = nn.Conv2d(in_planes, width, kernel_size=1, stride=stride if stride_in_1x1 else 1, bias=False)
     self.bn1 = nn.BatchNorm2d(width)
-    self.conv2 = nn.Conv2d(width, width, kernel_size=3, padding=1, stride=stride, groups=groups, bias=False)
+    self.conv2 = nn.Conv2d(width, width, kernel_size=3, padding=1, stride=1 if stride_in_1x1 else stride, groups=groups, bias=False)
     self.bn2 = nn.BatchNorm2d(width)
     self.conv3 = nn.Conv2d(width, self.expansion*planes, kernel_size=1, bias=False)
     self.bn3 = nn.BatchNorm2d(self.expansion*planes)
@@ -54,9 +55,8 @@ class Bottleneck:
     return out
 
 class ResNet:
-  def __init__(self, num, num_classes, groups=1, width_per_group=64):
+  def __init__(self, num, num_classes=None, groups=1, width_per_group=64, stride_in_1x1=False):
     self.num = num
-
     self.block = {
       18: BasicBlock,
       34: BasicBlock,
@@ -79,30 +79,41 @@ class ResNet:
     self.base_width = width_per_group
     self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, bias=False, padding=3)
     self.bn1 = nn.BatchNorm2d(64)
-    self.layer1 = self._make_layer(self.block, 64, self.num_blocks[0], stride=1)
-    self.layer2 = self._make_layer(self.block, 128, self.num_blocks[1], stride=2)
-    self.layer3 = self._make_layer(self.block, 256, self.num_blocks[2], stride=2)
-    self.layer4 = self._make_layer(self.block, 512, self.num_blocks[3], stride=2)
-    self.fc = nn.Linear(512 * self.block.expansion, num_classes)
+    self.layer1 = self._make_layer(self.block, 64, self.num_blocks[0], stride=1, stride_in_1x1=stride_in_1x1)
+    self.layer2 = self._make_layer(self.block, 128, self.num_blocks[1], stride=2, stride_in_1x1=stride_in_1x1)
+    self.layer3 = self._make_layer(self.block, 256, self.num_blocks[2], stride=2, stride_in_1x1=stride_in_1x1)
+    self.layer4 = self._make_layer(self.block, 512, self.num_blocks[3], stride=2, stride_in_1x1=stride_in_1x1)
+    self.fc = nn.Linear(512 * self.block.expansion, num_classes) if num_classes is not None else None
 
-  def _make_layer(self,
block, planes, num_blocks, stride): + def _make_layer(self, block, planes, num_blocks, stride, stride_in_1x1): strides = [stride] + [1] * (num_blocks-1) layers = [] for stride in strides: - layers.append(block(self.in_planes, planes, stride, self.groups, self.base_width)) + if block == Bottleneck: + layers.append(block(self.in_planes, planes, stride, stride_in_1x1, self.groups, self.base_width)) + else: + layers.append(block(self.in_planes, planes, stride, self.groups, self.base_width)) self.in_planes = planes * block.expansion return layers def forward(self, x): + is_feature_only = self.fc is None + if is_feature_only: features = [] out = self.bn1(self.conv1(x)).relu() out = out.pad2d([1,1,1,1]).max_pool2d((3,3), 2) out = out.sequential(self.layer1) + if is_feature_only: features.append(out) out = out.sequential(self.layer2) + if is_feature_only: features.append(out) out = out.sequential(self.layer3) + if is_feature_only: features.append(out) out = out.sequential(self.layer4) - out = out.mean([2,3]) - out = self.fc(out).log_softmax() - return out + if is_feature_only: features.append(out) + if not is_feature_only: + out = out.mean([2,3]) + out = self.fc(out).log_softmax() + return out + return features def __call__(self, x): return self.forward(x) @@ -140,4 +151,4 @@ ResNet34 = lambda num_classes=1000: ResNet(34, num_classes=num_classes) ResNet50 = lambda num_classes=1000: ResNet(50, num_classes=num_classes) ResNet101 = lambda num_classes=1000: ResNet(101, num_classes=num_classes) ResNet152 = lambda num_classes=1000: ResNet(152, num_classes=num_classes) -ResNeXt50_32X4D = lambda num_classes=1000: ResNet(50, num_classes=num_classes, groups=32, width_per_group=4) +ResNeXt50_32X4D = lambda num_classes=1000: ResNet(50, num_classes=num_classes, groups=32, width_per_group=4) \ No newline at end of file diff --git a/test/test_ops.py b/test/test_ops.py index 2d21180b..db0952d1 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -243,6 +243,9 @@ class TestOps(unittest.TestCase): def test_log(self): helper_test_op([(45,65)], lambda x: torch.log(x), Tensor.log) helper_test_op([()], lambda x: torch.log(x), Tensor.log) + def test_log2(self): + helper_test_op([(45,65)], lambda x: torch.log2(x), Tensor.log2) + helper_test_op([()], lambda x: torch.log2(x), Tensor.log2) def test_exp(self): helper_test_op([(45,65)], lambda x: torch.exp(x), Tensor.exp) helper_test_op([()], lambda x: torch.exp(x), Tensor.exp) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 31da9886..d3f29b2b 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -7,7 +7,7 @@ import operator import numpy as np from typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence, cast from tinygrad.helpers import ImageDType, argfix, make_pair, getenv, IMAGE, DEBUG, flatten, DType, dtypes -from math import ceil, pi, prod, sqrt +from math import ceil, pi, prod, sqrt, log from tinygrad.lazy import Device, LazyBuffer from tinygrad.ops import LoadOps @@ -481,6 +481,7 @@ class Tensor: def contiguous(self): return mlops.Contiguous.apply(self) def log(self): return mlops.Log.apply(self) + def log2(self): return mlops.Log.apply(self)/log(2) def exp(self): return mlops.Exp.apply(self) def relu(self): return mlops.Relu.apply(self) def sin(self): return mlops.Sin.apply(self)
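
As a quick sanity check for the two behavioural changes above (the headless ResNet returning a feature pyramid when num_classes=None, and the new Tensor.log2), a sketch like the following could be run from the repo root on this branch. The 224x224 input and the tolerances are arbitrary choices for illustration; this snippet is not part of the diff itself.

import numpy as np
from tinygrad.tensor import Tensor
from models.resnet import ResNet

# num_classes=None drops the fc head, so forward() returns the four stage outputs instead of logits
backbone = ResNet(50, num_classes=None, stride_in_1x1=True)
feats = backbone(Tensor.randn(1, 3, 224, 224))
assert len(feats) == 4
print([f.shape for f in feats])  # spatial strides 4, 8, 16, 32 relative to the input

# log2 is implemented as log(x) / ln(2); compare against numpy on positive inputs
x = (np.random.rand(4, 4) + 0.5).astype(np.float32)
np.testing.assert_allclose(Tensor(x).log2().numpy(), np.log2(x), rtol=1e-4, atol=1e-4)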