diff --git a/data/test/images/image_voc2007_000001.jpg b/data/test/images/image_voc2007_000001.jpg new file mode 100644 index 00000000..c60f921e --- /dev/null +++ b/data/test/images/image_voc2007_000001.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f0bdad67d01aa452929683b74a124a2926b6bce534c85f3ee0f00e20eeacab0 +size 78771 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 3c4724aa..2bb0b470 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -76,6 +76,7 @@ class Models(object): image_casmvs_depth_estimation = 'image-casmvs-depth-estimation' vop_retrieval_model = 'vop-retrieval-model' ddcolor = 'ddcolor' + defrcn = 'defrcn' image_face_fusion = 'image-face-fusion' # EasyCV models @@ -296,6 +297,7 @@ class Pipelines(object): image_multi_view_depth_estimation = 'image-multi-view-depth-estimation' vop_retrieval = 'vop-video-text-retrieval' ddcolor_image_colorization = 'ddcolor-image-colorization' + image_fewshot_detection = 'image-fewshot-detection' image_face_fusion = 'image-face-fusion' # nlp tasks @@ -416,6 +418,7 @@ class Trainers(object): referring_video_object_segmentation = 'referring-video-object-segmentation' image_classification_team = 'image-classification-team' image_classification = 'image-classification' + image_fewshot_detection = 'image-fewshot-detection' # nlp trainers bert_sentiment_analysis = 'bert-sentiment-analysis' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index c0a0dc42..b906aa12 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -5,20 +5,20 @@ from . 
import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, crowd_counting, face_2d_keypoints, face_detection, face_generation, human_wholebody_keypoint, image_classification, - image_color_enhance, image_colorization, image_denoise, - image_inpainting, image_instance_segmentation, image_matching, - image_mvs_depth_estimation, image_panoptic_segmentation, - image_portrait_enhancement, image_reid_person, - image_semantic_segmentation, image_to_image_generation, - image_to_image_translation, language_guided_video_summarization, - movie_scene_segmentation, object_detection, - panorama_depth_estimation, pointcloud_sceneflow_estimation, - product_retrieval_embedding, realtime_object_detection, - referring_video_object_segmentation, salient_detection, - shop_segmentation, super_resolution, video_frame_interpolation, - video_object_segmentation, video_single_object_tracking, - video_stabilization, video_summarization, - video_super_resolution, virual_tryon, vision_middleware, - vop_retrieval) + image_color_enhance, image_colorization, image_defrcn_fewshot, + image_denoise, image_inpainting, image_instance_segmentation, + image_matching, image_mvs_depth_estimation, + image_panoptic_segmentation, image_portrait_enhancement, + image_reid_person, image_semantic_segmentation, + image_to_image_generation, image_to_image_translation, + language_guided_video_summarization, movie_scene_segmentation, + object_detection, panorama_depth_estimation, + pointcloud_sceneflow_estimation, product_retrieval_embedding, + realtime_object_detection, referring_video_object_segmentation, + salient_detection, shop_segmentation, super_resolution, + video_frame_interpolation, video_object_segmentation, + video_single_object_tracking, video_stabilization, + video_summarization, video_super_resolution, virual_tryon, + vision_middleware, vop_retrieval) # yapf: enable diff --git a/modelscope/models/cv/image_defrcn_fewshot/__init__.py 
b/modelscope/models/cv/image_defrcn_fewshot/__init__.py new file mode 100644 index 00000000..ef73351a --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .defrcn_for_fewshot import DeFRCNForFewShot + +else: + _import_structure = {'defrcn_for_fewshot': ['DeFRCNForFewShot']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py b/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py new file mode 100644 index 00000000..d42e59b2 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py @@ -0,0 +1,80 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .models.defaults_config import _C +from .models.defrcn import DeFRCN +from .utils.requirements_check import requires_version + +logger = get_logger() +__all__ = ['DeFRCNForFewShot'] + + +@MODELS.register_module( + Tasks.image_fewshot_detection, module_name=Models.defrcn) +class DeFRCNForFewShot(TorchModel): + """ Few-shot object detection model DeFRCN. The model requires detectron2-0.3 and pytorch-1.11. + Model config params mainly from detectron2, you can use detectron2 config file to initialize model. + Detail configs can be visited on detectron2.config.defaults and .models.defaults_config. 
+ """ + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the few-shot defrcn model from the `model_dir` path. + + Args: + model_dir (str): the model path. + + """ + requires_version() + + super().__init__(model_dir, *args, **kwargs) + + self.model_dir = model_dir + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + + if 'config_path' in kwargs: + self.config.merge_from_dict( + {'model.config_path': kwargs['config_path']}) + + self.model_cfg = _C.clone() + self.model_cfg.merge_from_file( + os.path.join(model_dir, self.config.model.config_path)) + + if 'model_weights' in kwargs: + self.model_cfg.merge_from_list( + ['MODEL.WEIGHTS', kwargs['model_weights']]) + + self.model_cfg.freeze() + + self.model = DeFRCN(self.model_cfg) + + def forward(self, inputs) -> Any: + """return the result by the model + + Args: + inputs (list): the preprocessed data + + Returns: + Any: results + """ + if self.training: + return self.model.forward(inputs) + else: + return self.model.inference(inputs) + + def inference(self, input: Dict[str, Any]) -> Any: + with torch.no_grad(): + results = self.model([input]) + return results[0] if len(results) > 0 else None + + def get_model_cfg(self): + return self.model_cfg diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/__init__.py b/modelscope/models/cv/image_defrcn_fewshot/models/__init__.py new file mode 100644 index 00000000..d463e460 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .defrcn import DeFRCN + +else: + _import_structure = {'defrcn': ['DeFRCN']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/defaults_config.py b/modelscope/models/cv/image_defrcn_fewshot/models/defaults_config.py new file mode 100644 index 00000000..55fcc43b --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/defaults_config.py @@ -0,0 +1,38 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/config/defaults.py + +from detectron2.config.defaults import _C + +_CC = _C + +# ----------- Backbone ----------- # +_CC.MODEL.BACKBONE.FREEZE = False +_CC.MODEL.BACKBONE.FREEZE_AT = 3 + +# ------------- RPN -------------- # +_CC.MODEL.RPN.FREEZE = False +_CC.MODEL.RPN.ENABLE_DECOUPLE = False +_CC.MODEL.RPN.BACKWARD_SCALE = 1.0 + +# ------------- ROI -------------- # +_CC.MODEL.ROI_HEADS.NAME = 'Res5ROIHeads' +_CC.MODEL.ROI_HEADS.FREEZE_FEAT = False +_CC.MODEL.ROI_HEADS.ENABLE_DECOUPLE = False +_CC.MODEL.ROI_HEADS.BACKWARD_SCALE = 1.0 +_CC.MODEL.ROI_HEADS.OUTPUT_LAYER = 'FastRCNNOutputLayers' +_CC.MODEL.ROI_HEADS.CLS_DROPOUT = False +_CC.MODEL.ROI_HEADS.DROPOUT_RATIO = 0.8 +_CC.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 7 # for faster + +# ------------- TEST ------------- # +_CC.TEST.PCB_ENABLE = False +_CC.TEST.PCB_MODELTYPE = 'resnet' # res-like +_CC.TEST.PCB_MODELPATH = '' +_CC.TEST.PCB_ALPHA = 0.50 +_CC.TEST.PCB_UPPER = 1.0 +_CC.TEST.PCB_LOWER = 0.05 + +# ------------ Other ------------- # +_CC.SOLVER.WEIGHT_DECAY = 5e-5 +_CC.MUTE_HEADER = True diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/defrcn.py 
b/modelscope/models/cv/image_defrcn_fewshot/models/defrcn.py new file mode 100644 index 00000000..a5258017 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/defrcn.py @@ -0,0 +1,179 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/rcnn.py + +import os +from typing import Dict + +import torch +from detectron2.layers import ShapeSpec +from detectron2.modeling.anchor_generator import DefaultAnchorGenerator +from detectron2.modeling.backbone.resnet import build_resnet_backbone +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.postprocessing import detector_postprocess +from detectron2.modeling.proposal_generator.rpn import RPN, StandardRPNHead +from detectron2.structures import ImageList +from torch import nn + +from .gdl import AffineLayer, decouple_layer +from .roi_heads import Res5ROIHeads + + +class DeFRCN(nn.Module): + + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + + self.device = torch.device(cfg.MODEL.DEVICE) + + self.backbone = build_resnet_backbone( + cfg, ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))) + self._SHAPE_ = self.backbone.output_shape() + + rpn_config = DeFRCN.from_rpn_config(cfg, self._SHAPE_) + self.proposal_generator = RPN(**rpn_config) + + self.roi_heads = Res5ROIHeads(cfg, self._SHAPE_) + self.normalizer = self.normalize_fn() + self.affine_rpn = AffineLayer( + num_channels=self._SHAPE_['res4'].channels, bias=True) + self.affine_rcnn = AffineLayer( + num_channels=self._SHAPE_['res4'].channels, bias=True) + self.to(self.device) + + if cfg.MODEL.BACKBONE.FREEZE: + for p in self.backbone.parameters(): + p.requires_grad = False + + if cfg.MODEL.RPN.FREEZE: + for p in self.proposal_generator.parameters(): + p.requires_grad = False + + if cfg.MODEL.ROI_HEADS.FREEZE_FEAT: + for p in 
self.roi_heads.res5.parameters(): + p.requires_grad = False + + def forward(self, batched_inputs): + if not self.training: + return self.inference(batched_inputs) + assert 'instances' in batched_inputs[0] + gt_instances = [x['instances'].to(self.device) for x in batched_inputs] + proposal_losses, detector_losses, _, _ = self._forward_once_( + batched_inputs, gt_instances) + losses = {} + losses.update(detector_losses) + losses.update(proposal_losses) + return losses + + def inference(self, batched_inputs): + assert not self.training + _, _, results, image_sizes = self._forward_once_(batched_inputs, None) + processed_results = [] + for r, input, image_size in zip(results, batched_inputs, image_sizes): + height = input.get('height', image_size[0]) + width = input.get('width', image_size[1]) + r = detector_postprocess(r, height, width) + processed_results.append({'instances': r}) + return processed_results + + def _forward_once_(self, batched_inputs, gt_instances=None): + + images = self.preprocess_image(batched_inputs) + features = self.backbone(images.tensor) + + features_de_rpn = features + if self.cfg.MODEL.RPN.ENABLE_DECOUPLE: + scale = self.cfg.MODEL.RPN.BACKWARD_SCALE + features_de_rpn = { + k: self.affine_rpn(decouple_layer(features[k], scale)) + for k in features + } + proposals, proposal_losses = self.proposal_generator( + images, features_de_rpn, gt_instances) + + features_de_rcnn = features + if self.cfg.MODEL.ROI_HEADS.ENABLE_DECOUPLE: + scale = self.cfg.MODEL.ROI_HEADS.BACKWARD_SCALE + features_de_rcnn = { + k: self.affine_rcnn(decouple_layer(features[k], scale)) + for k in features + } + results, detector_losses = self.roi_heads(images, features_de_rcnn, + proposals, gt_instances) + + return proposal_losses, detector_losses, results, images.image_sizes + + def preprocess_image(self, batched_inputs): + images = [x['image'].to(self.device) for x in batched_inputs] + images = [self.normalizer(x) for x in images] + images = ImageList.from_tensors(images, + 
self.backbone.size_divisibility) + return images + + def normalize_fn(self): + assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD) + num_channels = len(self.cfg.MODEL.PIXEL_MEAN) + pixel_mean = ( + torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).to(self.device).view( + num_channels, 1, 1)) + pixel_std = ( + torch.Tensor(self.cfg.MODEL.PIXEL_STD).to(self.device).view( + num_channels, 1, 1)) + return lambda x: (x - pixel_mean) / pixel_std + + @classmethod + def from_rpn_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + in_features = cfg.MODEL.RPN.IN_FEATURES + ret = { + 'in_features': + in_features, + 'min_box_size': + cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE, + 'nms_thresh': + cfg.MODEL.RPN.NMS_THRESH, + 'batch_size_per_image': + cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, + 'positive_fraction': + cfg.MODEL.RPN.POSITIVE_FRACTION, + 'loss_weight': { + 'loss_rpn_cls': + cfg.MODEL.RPN.LOSS_WEIGHT, + 'loss_rpn_loc': + cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT, + }, + 'anchor_boundary_thresh': + cfg.MODEL.RPN.BOUNDARY_THRESH, + 'box2box_transform': + Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS), + 'box_reg_loss_type': + cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE, + 'smooth_l1_beta': + cfg.MODEL.RPN.SMOOTH_L1_BETA, + } + + ret['pre_nms_topk'] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN, + cfg.MODEL.RPN.PRE_NMS_TOPK_TEST) + ret['post_nms_topk'] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN, + cfg.MODEL.RPN.POST_NMS_TOPK_TEST) + + # ret["anchor_generator"] = build_anchor_generator(cfg, [input_shape[f] for f in in_features]) + anchor_cfg = DefaultAnchorGenerator.from_config( + cfg, [input_shape[f] for f in in_features]) + ret['anchor_generator'] = DefaultAnchorGenerator(**anchor_cfg) + ret['anchor_matcher'] = Matcher( + cfg.MODEL.RPN.IOU_THRESHOLDS, + cfg.MODEL.RPN.IOU_LABELS, + allow_low_quality_matches=True) + rpn_head_cfg = { + 'in_channels': + [s.channels for s in [input_shape[f] for f in in_features]][0], + 'num_anchors': + 
ret['anchor_generator'].num_anchors[0], + 'box_dim': + ret['anchor_generator'].box_dim + } + + ret['head'] = StandardRPNHead(**rpn_head_cfg) + return ret diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py b/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py new file mode 100644 index 00000000..9415b5a6 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py @@ -0,0 +1,274 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/rcnn.py + +import numpy as np +import torch +from detectron2.layers import batched_nms, cat +from detectron2.modeling.roi_heads.fast_rcnn import \ + fast_rcnn_inference_single_image +from detectron2.utils.events import get_event_storage +from fvcore.nn import smooth_l1_loss +from torch import nn +from torch.nn import functional as F + + +def fast_rcnn_inference(boxes, scores, image_shapes, score_thresh, nms_thresh, + topk_per_image): + + result_per_image = [ + fast_rcnn_inference_single_image( + boxes_per_image, + scores_per_image, + image_shape, + score_thresh, + nms_thresh, + topk_per_image, + ) for scores_per_image, boxes_per_image, image_shape in zip( + scores, boxes, image_shapes) + ] + return tuple(list(x) for x in zip(*result_per_image)) + + +class FastRCNNOutputs(object): + """ + A class that stores information about outputs of a Fast R-CNN head. + """ + + def __init__( + self, + box2box_transform, + pred_class_logits, + pred_proposal_deltas, + proposals, + smooth_l1_beta, + ): + """ + Args: + box2box_transform (Box2BoxTransform/Box2BoxTransformRotated): + box2box transform instance for proposal-to-detection transformations. + pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class + logits for all R predicted object instances. + Each row corresponds to a predicted object instance. 
+ pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for + class-specific or class-agnostic regression. It stores the predicted deltas that + transform proposals into final box detections. + B is the box dimension (4 or 5). + When B is 4, each row is [dx, dy, dw, dh (, ....)]. + When B is 5, each row is [dx, dy, dw, dh, da (, ....)]. + proposals (list[Instances]): A list of N Instances, where Instances i stores the + proposals for image i, in the field "proposal_boxes". + When training, each Instances must have ground-truth labels + stored in the field "gt_classes" and "gt_boxes". + smooth_l1_beta (float): The transition point between L1 and L2 loss in + the smooth L1 loss function. When set to 0, the loss becomes L1. When + set to +inf, the loss becomes constant 0. + """ + self.box2box_transform = box2box_transform + self.num_preds_per_image = [len(p) for p in proposals] + self.pred_class_logits = pred_class_logits + self.pred_proposal_deltas = pred_proposal_deltas + self.smooth_l1_beta = smooth_l1_beta + + box_type = type(proposals[0].proposal_boxes) + # cat(..., dim=0) concatenates over all images in the batch + self.proposals = box_type.cat([p.proposal_boxes for p in proposals]) + assert (not self.proposals.tensor.requires_grad + ), 'Proposals should not require gradients!' + self.image_shapes = [x.image_size for x in proposals] + + # The following fields should exist only when training. + if proposals[0].has('gt_boxes'): + self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals]) + assert proposals[0].has('gt_classes') + self.gt_classes = cat([p.gt_classes for p in proposals], dim=0) + + def _log_accuracy(self): + """ + Log the accuracy metrics to EventStorage. 
+ """ + num_instances = self.gt_classes.numel() + pred_classes = self.pred_class_logits.argmax(dim=1) + bg_class_ind = self.pred_class_logits.shape[1] - 1 + + fg_inds = (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind) + num_fg = fg_inds.nonzero().numel() + fg_gt_classes = self.gt_classes[fg_inds] + fg_pred_classes = pred_classes[fg_inds] + + num_false_negative = (( + fg_pred_classes == bg_class_ind).nonzero().numel()) + num_accurate = (pred_classes == self.gt_classes).nonzero().numel() + fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel() + + storage = get_event_storage() + storage.put_scalar('fast_rcnn/cls_accuracy', + num_accurate / num_instances) + if num_fg > 0: + storage.put_scalar('fast_rcnn/fg_cls_accuracy', + fg_num_accurate / num_fg) + storage.put_scalar('fast_rcnn/false_negative', + num_false_negative / num_fg) + + def softmax_cross_entropy_loss(self): + """ + Compute the softmax cross entropy loss for box classification. + + Returns: + scalar Tensor + """ + self._log_accuracy() + return F.cross_entropy( + self.pred_class_logits, self.gt_classes, reduction='mean') + + def smooth_l1_loss(self): + """ + Compute the smooth L1 loss for box regression. 
+ + Returns: + scalar Tensor + """ + gt_proposal_deltas = self.box2box_transform.get_deltas( + self.proposals.tensor, self.gt_boxes.tensor) + box_dim = gt_proposal_deltas.size(1) # 4 or 5 + cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim + device = self.pred_proposal_deltas.device + + bg_class_ind = self.pred_class_logits.shape[1] - 1 + + fg_inds = torch.nonzero((self.gt_classes >= 0) + & (self.gt_classes < bg_class_ind)).squeeze(1) + if cls_agnostic_bbox_reg: + # pred_proposal_deltas only corresponds to foreground class for agnostic + gt_class_cols = torch.arange(box_dim, device=device) + else: + fg_gt_classes = self.gt_classes[fg_inds] + gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange( + box_dim, device=device) + + loss_box_reg = smooth_l1_loss( + self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols], + gt_proposal_deltas[fg_inds], + self.smooth_l1_beta, + reduction='sum', + ) + + loss_box_reg = loss_box_reg / self.gt_classes.numel() + return loss_box_reg + + def losses(self): + """ + Compute the default losses for box head in Fast(er) R-CNN, + with softmax cross entropy loss and smooth L1 loss. + + Returns: + A dict of losses (scalar tensors) containing keys "loss_cls" and "loss_box_reg". + """ + return { + 'loss_cls': self.softmax_cross_entropy_loss(), + 'loss_box_reg': self.smooth_l1_loss(), + } + + def predict_boxes(self): + """ + Returns: + list[Tensor]: A list of Tensors of predicted class-specific or class-agnostic boxes + for each image. 
Element i has shape (Ri, K * B) or (Ri, B), where Ri is + the number of predicted objects for image i and B is the box dimension (4 or 5) + """ + num_pred = len(self.proposals) + B = self.proposals.tensor.shape[1] + K = self.pred_proposal_deltas.shape[1] // B + boxes = self.box2box_transform.apply_deltas( + self.pred_proposal_deltas.view(num_pred * K, B), + self.proposals.tensor.unsqueeze(1).expand(num_pred, K, + B).reshape(-1, B), + ) + return boxes.view(num_pred, K * B).split( + self.num_preds_per_image, dim=0) + + def predict_probs(self): + """ + Returns: + list[Tensor]: A list of Tensors of predicted class probabilities for each image. + Element i has shape (Ri, K + 1), where Ri is the number of predicted objects + for image i. + """ + probs = F.softmax(self.pred_class_logits, dim=-1) + return probs.split(self.num_preds_per_image, dim=0) + + def inference(self, score_thresh, nms_thresh, topk_per_image): + """ + Args: + score_thresh (float): same as fast_rcnn_inference. + nms_thresh (float): same as fast_rcnn_inference. + topk_per_image (int): same as fast_rcnn_inference. + Returns: + list[Instances]: same as fast_rcnn_inference. + list[Tensor]: same as fast_rcnn_inference. + """ + boxes = self.predict_boxes() + scores = self.predict_probs() + image_shapes = self.image_shapes + + return fast_rcnn_inference( + boxes, + scores, + image_shapes, + score_thresh, + nms_thresh, + topk_per_image, + ) + + +class FastRCNNOutputLayers(nn.Module): + """ + Two linear layers for predicting Fast R-CNN outputs: + (1) proposal-to-detection box regression deltas + (2) classification scores + """ + + def __init__(self, + cfg, + input_size, + num_classes, + cls_agnostic_bbox_reg, + box_dim=4): + """ + Args: + cfg: config + input_size (int): channels, or (channels, height, width) + num_classes (int): number of foreground classes + cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression + box_dim (int): the dimension of bounding boxes. 
+ Example box dimensions: 4 for regular XYXY boxes and 5 for rotated XYWHA boxes + """ + super(FastRCNNOutputLayers, self).__init__() + + if not isinstance(input_size, int): + input_size = np.prod(input_size) + + # The prediction layer for num_classes foreground classes and one + # background class + self.cls_score = nn.Linear(input_size, num_classes + 1) + num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes + self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim) + + nn.init.normal_(self.cls_score.weight, std=0.01) + nn.init.normal_(self.bbox_pred.weight, std=0.001) + for b in [self.cls_score, self.bbox_pred]: + nn.init.constant_(b.bias, 0) + + self._do_cls_dropout = cfg.MODEL.ROI_HEADS.CLS_DROPOUT + self._dropout_ratio = cfg.MODEL.ROI_HEADS.DROPOUT_RATIO + + def forward(self, x): + if x.dim() > 2: + x = torch.flatten(x, start_dim=1) + proposal_deltas = self.bbox_pred(x) + + if self._do_cls_dropout: + x = F.dropout(x, self._dropout_ratio, training=self.training) + scores = self.cls_score(x) + + return scores, proposal_deltas diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/gdl.py b/modelscope/models/cv/image_defrcn_fewshot/models/gdl.py new file mode 100644 index 00000000..0d228fa7 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/gdl.py @@ -0,0 +1,43 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/gdl.py + +import torch +import torch.nn as nn +from torch.autograd import Function + + +class GradientDecoupleLayer(Function): + + @staticmethod + def forward(ctx, x, _lambda): + ctx._lambda = _lambda + return x + + @staticmethod + def backward(ctx, grad_output): + grad_output = grad_output * ctx._lambda + return grad_output, None + + +class AffineLayer(nn.Module): + + def __init__(self, num_channels, bias=False): + super(AffineLayer, self).__init__() + weight = 
torch.FloatTensor(1, num_channels, 1, 1).fill_(1) + self.weight = nn.Parameter(weight, requires_grad=True) + + self.bias = None + if bias: + bias = torch.FloatTensor(1, num_channels, 1, 1).fill_(0) + self.bias = nn.Parameter(bias, requires_grad=True) + + def forward(self, X): + out = X * self.weight.expand_as(X) + if self.bias is not None: + out = out + self.bias.expand_as(X) + return out + + +def decouple_layer(x, _lambda): + return GradientDecoupleLayer.apply(x, _lambda) diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py b/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py new file mode 100644 index 00000000..9ac78119 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py @@ -0,0 +1,302 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/roi_heads/roi_heads.py + +from typing import Dict + +import numpy as np +import torch +from detectron2.layers import ShapeSpec +from detectron2.modeling.backbone.resnet import BottleneckBlock, make_stage +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.poolers import ROIPooler +from detectron2.modeling.proposal_generator.proposal_utils import \ + add_ground_truth_to_proposals +from detectron2.modeling.roi_heads import select_foreground_proposals +from detectron2.modeling.sampling import subsample_labels +from detectron2.structures import Boxes, Instances, pairwise_iou +from detectron2.utils.events import get_event_storage +from torch import nn + +from .fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs + + +class ROIHeads(torch.nn.Module): + """ + ROIHeads perform all per-region computation in an R-CNN. + + It contains logic of cropping the regions, extract per-region features, + and make per-region predictions. 
+ + It can have many variants, implemented as subclasses of this class. + """ + + def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): + super(ROIHeads, self).__init__() + + # fmt: off + self.batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE + self.positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + self.test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST + self.test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST + self.test_detections_per_img = cfg.TEST.DETECTIONS_PER_IMAGE + self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES + self.proposal_append_gt = cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT + self.feature_strides = {k: v.stride for k, v in input_shape.items()} + self.feature_channels = {k: v.channels for k, v in input_shape.items()} + self.cls_agnostic_bbox_reg = cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG + self.smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA + # fmt: on + + # Matcher to assign box proposals to gt boxes + self.proposal_matcher = Matcher( + cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS, + cfg.MODEL.ROI_HEADS.IOU_LABELS, + allow_low_quality_matches=False, + ) + + # Box2BoxTransform for bounding box regression + self.box2box_transform = Box2BoxTransform( + weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS) + + def _sample_proposals(self, matched_idxs, matched_labels, gt_classes): + """ + Based on the matching between N proposals and M groundtruth, + sample the proposals and set their classification labels. + + Args: + matched_idxs (Tensor): a vector of length N, each is the best-matched + gt index in [0, M) for each proposal. + matched_labels (Tensor): a vector of length N, the matcher's label + (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal. + gt_classes (Tensor): a vector of length M. + + Returns: + Tensor: a vector of indices of sampled proposals. Each is in [0, N). 
+ Tensor: a vector of the same length, the classification label for + each sampled proposal. Each sample is labeled as either a category in + [0, num_classes) or the background (num_classes). + """ + has_gt = gt_classes.numel() > 0 + # Get the corresponding GT for each proposal + if has_gt: + gt_classes = gt_classes[matched_idxs] + # Label unmatched proposals (0 label from matcher) as background (label=num_classes) + gt_classes[matched_labels == 0] = self.num_classes + # Label ignore proposals (-1 label) + gt_classes[matched_labels == -1] = -1 + else: + gt_classes = torch.zeros_like(matched_idxs) + self.num_classes + + sampled_fg_idxs, sampled_bg_idxs = subsample_labels( + gt_classes, + self.batch_size_per_image, + self.positive_sample_fraction, + self.num_classes, + ) + + sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0) + return sampled_idxs, gt_classes[sampled_idxs] + + @torch.no_grad() + def label_and_sample_proposals(self, proposals, targets): + """ + Prepare some proposals to be used to train the ROI heads. + It performs box matching between `proposals` and `targets`, and assigns + training labels to the proposals. + It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes, + with a fraction of positives that is no larger than `self.positive_sample_fraction. + + Args: + See :meth:`ROIHeads.forward` + + Returns: + list[Instances]: + length `N` list of `Instances`s containing the proposals + sampled for training. Each `Instances` has the following fields: + - proposal_boxes: the proposal boxes + - gt_boxes: the ground-truth box that the proposal is assigned to + (this is only meaningful if the proposal has a label > 0; if label = 0 + then the ground-truth box is random) + Other fields such as "gt_classes" that's included in `targets`. 
+ """ + gt_boxes = [x.gt_boxes for x in targets] + + if self.proposal_append_gt: + proposals = add_ground_truth_to_proposals(gt_boxes, proposals) + + proposals_with_gt = [] + + num_fg_samples = [] + num_bg_samples = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + has_gt = len(targets_per_image) > 0 + match_quality_matrix = pairwise_iou( + targets_per_image.gt_boxes, proposals_per_image.proposal_boxes) + matched_idxs, matched_labels = self.proposal_matcher( + match_quality_matrix) + sampled_idxs, gt_classes = self._sample_proposals( + matched_idxs, matched_labels, targets_per_image.gt_classes) + + # Set target attributes of the sampled proposals: + proposals_per_image = proposals_per_image[sampled_idxs] + proposals_per_image.gt_classes = gt_classes + + # We index all the attributes of targets that start with "gt_" + # and have not been added to proposals yet (="gt_classes"). + if has_gt: + sampled_targets = matched_idxs[sampled_idxs] + + for ( + trg_name, + trg_value, + ) in targets_per_image.get_fields().items(): + if trg_name.startswith( + 'gt_') and not proposals_per_image.has(trg_name): + proposals_per_image.set(trg_name, + trg_value[sampled_targets]) + else: + gt_boxes = Boxes( + targets_per_image.gt_boxes.tensor.new_zeros( + (len(sampled_idxs), 4))) + proposals_per_image.gt_boxes = gt_boxes + + num_bg_samples.append( + (gt_classes == self.num_classes).sum().item()) + num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) + proposals_with_gt.append(proposals_per_image) + + # Log the number of fg/bg samples that are selected for training ROI heads + storage = get_event_storage() + storage.put_scalar('roi_head/num_fg_samples', np.mean(num_fg_samples)) + storage.put_scalar('roi_head/num_bg_samples', np.mean(num_bg_samples)) + + return proposals_with_gt + + def forward(self, images, features, proposals, targets=None): + """ + Args: + images (ImageList): + features (dict[str: Tensor]): input data as a mapping from feature + map 
class Res5ROIHeads(ROIHeads):
    """ROI heads for the "C4" R-CNN variant.

    All region proposals share a single ROI-pooling step followed by one
    Res5 stage that performs the per-region feature computation.
    """

    def __init__(self, cfg, input_shape):
        super().__init__(cfg, input_shape)

        # This head operates on exactly one backbone feature map.
        assert len(self.in_features) == 1

        box_head_cfg = cfg.MODEL.ROI_BOX_HEAD
        self.pooler = ROIPooler(
            output_size=box_head_cfg.POOLER_RESOLUTION,
            scales=(1.0 / self.feature_strides[self.in_features[0]], ),
            sampling_ratio=box_head_cfg.POOLER_SAMPLING_RATIO,
            pooler_type=box_head_cfg.POOLER_TYPE,
        )

        self.res5, res5_out_channels = self._build_res5_block(cfg)
        self.box_predictor = FastRCNNOutputLayers(cfg, res5_out_channels,
                                                  self.num_classes,
                                                  self.cls_agnostic_bbox_reg)

    def _build_res5_block(self, cfg):
        """Create the Res5 stage (3 bottleneck blocks).

        Returns:
            (nn.Sequential, int): the stage and its output channel count.
        """
        resnets_cfg = cfg.MODEL.RESNETS
        expansion = 2**3  # res5 is 8x res2
        bottleneck_channels = (
            resnets_cfg.NUM_GROUPS * resnets_cfg.WIDTH_PER_GROUP * expansion)
        out_channels = resnets_cfg.RES2_OUT_CHANNELS * expansion
        assert not resnets_cfg.DEFORM_ON_PER_STAGE[-1], \
            'Deformable conv is not yet supported in res5 head.'

        blocks = make_stage(
            BottleneckBlock,
            3,
            first_stride=2,
            in_channels=out_channels // 2,
            bottleneck_channels=bottleneck_channels,
            out_channels=out_channels,
            num_groups=resnets_cfg.NUM_GROUPS,
            norm=resnets_cfg.NORM,
            stride_in_1x1=resnets_cfg.STRIDE_IN_1X1,
        )
        return nn.Sequential(*blocks), out_channels

    def _shared_roi_transform(self, features, boxes):
        """ROI-pool ``boxes`` from ``features`` and run them through res5."""
        return self.res5(self.pooler(features, boxes))

    def forward(self, images, features, proposals, targets=None):
        """See :class:`ROIHeads.forward`."""
        del images  # unused; kept in the signature for interface parity

        if self.training:
            proposals = self.label_and_sample_proposals(proposals, targets)
        del targets

        box_lists = [p.proposal_boxes for p in proposals]
        region_feats = self._shared_roi_transform(
            [features[name] for name in self.in_features], box_lists)
        pooled = region_feats.mean(dim=[2, 3])  # global-average-pool to 1x1
        cls_logits, box_deltas = self.box_predictor(pooled)
        del pooled

        outputs = FastRCNNOutputs(
            self.box2box_transform,
            cls_logits,
            box_deltas,
            proposals,
            self.smooth_l1_beta,
        )

        if self.training:
            del features
            return [], outputs.losses()

        pred_instances, _ = outputs.inference(
            self.test_score_thresh,
            self.test_nms_thresh,
            self.test_detections_per_img,
        )
        return pred_instances, {}
import importlib.util
import sys
from collections import OrderedDict

from packaging import version

from modelscope.utils.import_utils import _torch_available

if sys.version_info < (3, 8):
    import importlib_metadata
else:
    import importlib.metadata as importlib_metadata

# DeFRCN is only validated against this exact detectron2 minor version.
DETECTRON2_REQUIRED_VERSION = version.parse('0.3')


def is_detectron2_version_available():
    """Return True iff detectron2 is installed at the required 0.3.x version."""
    # `importlib.util` must be imported explicitly (see module imports);
    # relying on `import importlib` alone is not guaranteed to expose it.
    if importlib.util.find_spec('detectron2') is None:
        return False
    installed = version.parse(importlib_metadata.version('detectron2'))
    return (installed.major, installed.minor) == (
        DETECTRON2_REQUIRED_VERSION.major, DETECTRON2_REQUIRED_VERSION.minor)


TORCH_REQUIRED_VERSION = version.parse('1.11')


def is_torch_version_available():
    """Return True iff torch is installed at the required 1.11.x version."""
    if not _torch_available:
        return False
    installed = version.parse(importlib_metadata.version('torch'))
    return (installed.major, installed.minor) == (
        TORCH_REQUIRED_VERSION.major, TORCH_REQUIRED_VERSION.minor)


DETECTRON2_IMPORT_ERROR = """
{0} requires the detectron2-0.3 but it was not found in your environment.
You can install it from modelscope lib with pip:
`pip install detectron2==0.3`
"""

TORCH_VERSION_IMPORT_ERROR = """
{0} requires the torch-1.11 but it was not found in your environment.
You can install it with pip:
`pip install torch==1.11`
"""

# Maps a requirement tag to (availability predicate, error-message template).
REQUIREMENTS_MAPPING_VERSION = OrderedDict([
    ('detectron2-0.3', (is_detectron2_version_available,
                        DETECTRON2_IMPORT_ERROR)),
    ('torch-1.11', (is_torch_version_available, TORCH_VERSION_IMPORT_ERROR)),
])
# Backward-compatible alias for the original (misspelled) public name.
REQUIREMENTS_MAAPING_VERSION = REQUIREMENTS_MAPPING_VERSION

REQUIREMENTS = ['detectron2-0.3', 'torch-1.11']


def requires_version():
    """Verify every DeFRCN runtime requirement.

    Raises:
        NotImplementedError: if a requirement tag has no registered check.
        ImportError: if any required dependency/version is missing; the
            message aggregates every failed requirement.
    """
    checks = []
    for req in REQUIREMENTS:
        if req not in REQUIREMENTS_MAPPING_VERSION:
            raise NotImplementedError(
                'version check is not supported for {}'.format(req))
        checks.append(REQUIREMENTS_MAPPING_VERSION[req])

    failed = [
        msg.format('DeFRCN') for available, msg in checks if not available()
    ]
    if failed:
        raise ImportError(''.join(failed))
def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
                                classnames: str):
    """
    Load Pascal VOC detection annotations into Detectron2 format.

    Args:
        name: registered dataset name; few-shot splits contain "shot"
            (e.g. "voc_2007_trainval_all1_3shot_seed0").
        root: data root containing "vocsplit" and "VOC{year}" directories.
        dirname: contains "Annotations", "ImageSets", "JPEGImages".
        split (str): one of "train", "test", "val", "trainval".
        classnames: ordered category names; the list index is the category_id.

    Returns:
        list[dict]: records in the Detectron2 standard dataset-dict format
        (one per kept instance for few-shot splits, one per image otherwise).
    """
    is_shots = 'shot' in name
    dicts = []
    if is_shots:
        fileids = {}
        split_dir = os.path.join(root, 'vocsplit')
        # "..._{K}shot_seed{S}" encodes the shot count and the sampling seed.
        shot = name.split('_')[-2].split('shot')[0]
        seed = int(name.split('_seed')[-1])
        split_dir = os.path.join(split_dir, 'seed{}'.format(seed))
        for cls in classnames:
            with PathManager.open(
                    os.path.join(split_dir,
                                 'box_{}shot_{}_train.txt'.format(shot,
                                                                  cls))) as f:
                # `np.str` was removed in NumPy 1.24; the builtin `str` is
                # the documented drop-in replacement and behaves identically.
                fileids_ = np.loadtxt(f, dtype=str).tolist()
                if isinstance(fileids_, str):
                    fileids_ = [fileids_]
                fileids_ = [
                    fid.split('/')[-1].split('.jpg')[0] for fid in fileids_
                ]
                fileids[cls] = fileids_

        for cls, fileids_ in fileids.items():
            dicts_ = []
            for fileid in fileids_:
                # VOC2012 image ids contain "_" (e.g. "2008_000008").
                year = '2012' if '_' in fileid else '2007'
                dir_voc = os.path.join(root, 'VOC{}'.format(year))
                anno_file = os.path.join(dir_voc, 'Annotations',
                                         fileid + '.xml')
                jpeg_file = os.path.join(dir_voc, 'JPEGImages',
                                         fileid + '.jpg')

                tree = ET.parse(anno_file)

                for obj in tree.findall('object'):
                    r = {
                        'file_name': jpeg_file,
                        'image_id': fileid,
                        'height': int(tree.findall('./size/height')[0].text),
                        'width': int(tree.findall('./size/width')[0].text),
                    }
                    cls_ = obj.find('name').text
                    if cls != cls_:
                        continue
                    bbox = obj.find('bndbox')
                    bbox = [
                        float(bbox.find(x).text)
                        for x in ['xmin', 'ymin', 'xmax', 'ymax']
                    ]
                    # VOC annotations use 1-based pixel coordinates.
                    bbox[0] -= 1.0
                    bbox[1] -= 1.0

                    instances = [{
                        'category_id': classnames.index(cls),
                        'bbox': bbox,
                        'bbox_mode': BoxMode.XYXY_ABS,
                    }]
                    r['annotations'] = instances
                    dicts_.append(r)
            if len(dicts_) > int(shot):
                # Keep exactly `shot` instances per class.
                dicts_ = np.random.choice(dicts_, int(shot), replace=False)
            dicts.extend(dicts_)
    else:
        with PathManager.open(
                os.path.join(root, dirname, 'ImageSets', 'Main',
                             split + '.txt')) as f:
            # `np.str` was removed in NumPy 1.24; use builtin `str`.
            fileids = np.loadtxt(f, dtype=str)

        for fileid in fileids:
            anno_file = os.path.join(root, dirname, 'Annotations',
                                     fileid + '.xml')
            jpeg_file = os.path.join(root, dirname, 'JPEGImages',
                                     fileid + '.jpg')

            tree = ET.parse(anno_file)

            r = {
                'file_name': jpeg_file,
                'image_id': fileid,
                'height': int(tree.findall('./size/height')[0].text),
                'width': int(tree.findall('./size/width')[0].text),
            }
            instances = []

            for obj in tree.findall('object'):
                cls = obj.find('name').text
                if not (cls in classnames):
                    continue
                bbox = obj.find('bndbox')
                bbox = [
                    float(bbox.find(x).text)
                    for x in ['xmin', 'ymin', 'xmax', 'ymax']
                ]
                # VOC annotations use 1-based pixel coordinates.
                bbox[0] -= 1.0
                bbox[1] -= 1.0

                instances.append({
                    'category_id': classnames.index(cls),
                    'bbox': bbox,
                    'bbox_mode': BoxMode.XYXY_ABS,
                })
            r['annotations'] = instances
            dicts.append(r)

    return dicts
+ ('voc_2012_trainval_base1', 'VOC2012', 'trainval', 'base1', 1), + ('voc_2012_trainval_base2', 'VOC2012', 'trainval', 'base2', 2), + ('voc_2012_trainval_base3', 'VOC2012', 'trainval', 'base3', 3), + ('voc_2007_trainval_all1', 'VOC2007', 'trainval', 'base_novel_1', 1), + ('voc_2007_trainval_all2', 'VOC2007', 'trainval', 'base_novel_2', 2), + ('voc_2007_trainval_all3', 'VOC2007', 'trainval', 'base_novel_3', 3), + ('voc_2012_trainval_all1', 'VOC2012', 'trainval', 'base_novel_1', 1), + ('voc_2012_trainval_all2', 'VOC2012', 'trainval', 'base_novel_2', 2), + ('voc_2012_trainval_all3', 'VOC2012', 'trainval', 'base_novel_3', 3), + ('voc_2007_test_base1', 'VOC2007', 'test', 'base1', 1), + ('voc_2007_test_base2', 'VOC2007', 'test', 'base2', 2), + ('voc_2007_test_base3', 'VOC2007', 'test', 'base3', 3), + ('voc_2007_test_novel1', 'VOC2007', 'test', 'novel1', 1), + ('voc_2007_test_novel2', 'VOC2007', 'test', 'novel2', 2), + ('voc_2007_test_novel3', 'VOC2007', 'test', 'novel3', 3), + ('voc_2007_test_all1', 'VOC2007', 'test', 'base_novel_1', 1), + ('voc_2007_test_all2', 'VOC2007', 'test', 'base_novel_2', 2), + ('voc_2007_test_all3', 'VOC2007', 'test', 'base_novel_3', 3), + ] + for prefix in ['all', 'novel']: + for sid in range(1, 4): + for shot in [1, 2, 3, 5, 10]: + for year in [2007, 2012]: + for seed in range(30): + seed = '_seed{}'.format(seed) + name = 'voc_{}_trainval_{}{}_{}shot{}'.format( + year, prefix, sid, shot, seed) + dirname = 'VOC{}'.format(year) + img_file = '{}_{}shot_split_{}_trainval'.format( + prefix, shot, sid) + keepclasses = ('base_novel_{}'.format(sid) if prefix + == 'all' else 'novel{}'.format(sid)) + METASPLITS.append( + (name, dirname, img_file, keepclasses, sid)) + + for name, dirname, split, keepclasses, sid in METASPLITS: + if name in DatasetCatalog: + continue + + year = 2007 if '2007' in name else 2012 + register_meta_voc( + name, + root, + dirname, + split, + year, + keepclasses, + sid, + ) + MetadataCatalog.get(name).evaluator_type = 
'pascal_voc' diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 88811011..eab76cb3 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -82,6 +82,8 @@ TASK_INPUTS = { InputType.IMAGE, Tasks.portrait_matting: InputType.IMAGE, + Tasks.image_fewshot_detection: + InputType.IMAGE, # image editing task result for a single image Tasks.skin_retouching: diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index d82834ed..951d201c 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -269,6 +269,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.image_multi_view_depth_estimation: ( Pipelines.image_multi_view_depth_estimation, 'damo/cv_casmvs_multi-view-depth-estimation_general'), + Tasks.image_fewshot_detection: ( + Pipelines.image_fewshot_detection, + 'damo/cv_resnet101_detection_fewshot-defrcn'), Tasks.image_body_reshaping: (Pipelines.image_body_reshaping, 'damo/cv_flow-based-body-reshaping_damo'), Tasks.image_face_fusion: (Pipelines.image_face_fusion, diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 6139d2ba..a38e4283 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -83,6 +83,7 @@ if TYPE_CHECKING: from .image_mvs_depth_estimation_pipeline import ImageMultiViewDepthEstimationPipeline from .panorama_depth_estimation_pipeline import PanoramaDepthEstimationPipeline from .ddcolor_image_colorization_pipeline import DDColorImageColorizationPipeline + from .image_defrcn_fewshot_pipeline import ImageDefrcnDetectionPipeline else: _import_structure = { @@ -197,6 +198,7 @@ else: 'ddcolor_image_colorization_pipeline': [ 'DDColorImageColorizationPipeline' ], + 'image_defrcn_fewshot_pipeline': ['ImageDefrcnDetectionPipeline'], } import sys diff --git a/modelscope/pipelines/cv/image_defrcn_fewshot_pipeline.py b/modelscope/pipelines/cv/image_defrcn_fewshot_pipeline.py new file mode 100644 index 
@PIPELINES.register_module(
    Tasks.image_fewshot_detection,
    module_name=Pipelines.image_fewshot_detection)
class ImageDefrcnDetectionPipeline(Pipeline):
    """ Image DeFRCN few-shot detection Pipeline. Given a image,
    pipeline will return the detection results on the image.
    Example:

    ```python
    >>> from modelscope.pipelines import pipeline
    >>> detector = pipeline('image-fewshot-detection', 'damo/cv_resnet101_detection_fewshot-defrcn')
    >>> detector('/Path/Image')
    {
        'scores': [0.8307567834854126, 0.1606406420469284],
        'labels': ['person', 'dog'],
        'boxes': [
            [27.391937255859375, 0.0, 353.0, 500.0],
            [64.22428131103516, 229.2884521484375, 213.90573120117188, 370.0657958984375]
        ]
    }
    >>> #
    ```
    """

    def __init__(self, model: str, **kwargs):
        """
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, auto_collate=False, **kwargs)

        model_path = os.path.join(self.model.model_dir,
                                  ModelFile.TORCH_MODEL_FILE)
        self.model.model = self._load_pretrained(
            self.model.model, model_path, self.model.model_cfg.MODEL.DEVICE)

    def _load_pretrained(self, net, load_path, device='cuda', strict=True):
        """Load checkpoint weights into ``net``.

        Training-only entries (scheduler/optimizer/iteration state) are
        dropped before loading the 'model' state dict.
        """
        load_net = torch.load(load_path, map_location=device)
        for key in ('scheduler', 'optimizer', 'iteration'):
            if key in load_net:
                del load_net[key]
        net.load_state_dict(load_net['model'], strict=strict)

        return net

    def preprocess(self, input: Input) -> Dict[str, Any]:
        """Decode the input into a BGR, CHW float tensor for the detector."""
        img = LoadImage.convert_to_ndarray(input)
        # `np.float` was removed in NumPy 1.24; float32 is used because
        # torch.Tensor(...) below produces a float32 tensor anyway.
        img = img.astype(np.float32)

        image = img[..., ::-1].copy()  # rgb to bgr
        tim = torch.Tensor(image).permute(2, 0, 1)

        result = {'image': tim}
        return result

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Run model inference on the preprocessed image."""
        outputs = self.model.inference(input)
        result = {'data': outputs}
        return result

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Convert raw detector instances to scores/labels/boxes lists."""
        if inputs['data'] is None:
            # No detections: return empty, well-formed output.
            return {
                OutputKeys.SCORES: [],
                OutputKeys.LABELS: [],
                OutputKeys.BOXES: []
            }

        objects = inputs['data']['instances'].get_fields()
        labels, bboxes = [], []
        for label, box in zip(objects['pred_classes'], objects['pred_boxes']):
            labels.append(self.model.config.model.classes[label])
            bboxes.append(box.tolist())

        scores = objects['scores'].tolist()

        outputs = {
            OutputKeys.SCORES: scores,
            OutputKeys.LABELS: labels,
            OutputKeys.BOXES: bboxes
        }
        return outputs
.image_inpainting_trainer import ImageInpaintingTrainer from .referring_video_object_segmentation_trainer import ReferringVideoObjectSegmentationTrainer + from .image_defrcn_fewshot_detection_trainer import ImageDefrcnFewshotTrainer else: _import_structure = { @@ -20,7 +21,9 @@ else: 'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'], 'image_inpainting_trainer': ['ImageInpaintingTrainer'], 'referring_video_object_segmentation_trainer': - ['ReferringVideoObjectSegmentationTrainer'] + ['ReferringVideoObjectSegmentationTrainer'], + 'image_defrcn_fewshot_detection_trainer': + ['ImageDefrcnFewshotTrainer'] } import sys diff --git a/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py b/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py new file mode 100644 index 00000000..04b2967a --- /dev/null +++ b/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py @@ -0,0 +1,316 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/engine/defaults.py +# https://github.com/er-muyue/DeFRCN/blob/main/tools/model_surgery.py + +import os +from typing import Callable, Optional, Union + +import torch +from detectron2.engine import SimpleTrainer, hooks +from detectron2.evaluation import DatasetEvaluators, verify_results +from detectron2.utils import comm +from torch import nn + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model, TorchModel +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.logger import get_logger + + +class DefaultTrainer(SimpleTrainer): + + def __init__(self, model, cfg): + + from collections import OrderedDict + from fvcore.nn.precise_bn import get_bn_modules + from torch.nn.parallel import DistributedDataParallel + + from 
detectron2.data.build import build_detection_train_loader, build_detection_test_loader + from detectron2.solver.build import build_optimizer, build_lr_scheduler + from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer + from detectron2.utils.logger import setup_logger + + setup_logger() + + optimizer = build_optimizer(cfg, model) + data_loader = build_detection_train_loader(cfg) + + if comm.get_world_size() > 1: + model = DistributedDataParallel( + model, + device_ids=[comm.get_local_rank()], + broadcast_buffers=False, + find_unused_parameters=True) + super().__init__(model, data_loader, optimizer) + + self.scheduler = build_lr_scheduler(cfg, optimizer) + + self.checkpointer = DetectionCheckpointer( + model, + cfg.OUTPUT_DIR, + optimizer=optimizer, + scheduler=self.scheduler, + ) + self.start_iter = 0 + self.max_iter = cfg.SOLVER.MAX_ITER + self.cfg = cfg + + self.register_hooks(self.build_hooks()) + + def resume_or_load(self, resume=True): + # The checkpoint stores the training iteration that just finished, thus we start + # at the next iteration (or iter zero if there's no checkpoint). + self.start_iter = ( + self.checkpointer.resume_or_load( + self.cfg.MODEL.WEIGHTS, resume=resume).get('iteration', -1) + + 1) + + def build_hooks(self): + """ + Build a list of default hooks, including timing, evaluation, + checkpointing, lr scheduling, precise BN, writing events. 
+ + Returns: + list[HookBase]: + """ + cfg = self.cfg.clone() + cfg.defrost() + cfg.DATALOADER.NUM_WORKERS = 0 + + ret = [ + hooks.IterationTimer(), + hooks.LRScheduler(self.optimizer, self.scheduler), + hooks.PreciseBN( + cfg.TEST.EVAL_PERIOD, + self.model, + build_detection_train_loader(cfg), + cfg.TEST.PRECISE_BN.NUM_ITER, + ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) + else None, + ] + + if comm.is_main_process(): + ret.append( + hooks.PeriodicCheckpointer(self.checkpointer, + cfg.SOLVER.CHECKPOINT_PERIOD)) + + def test_and_save_results(): + self._last_eval_results = self.test(self.cfg, self.model) + return self._last_eval_results + + ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) + + if comm.is_main_process(): + ret.append(hooks.PeriodicWriter(self.build_writers(), period=20)) + return ret + + def build_writers(self): + from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter + + return [ + CommonMetricPrinter(self.max_iter), + JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, 'metrics.json')), + TensorboardXWriter(self.cfg.OUTPUT_DIR), + ] + + def train(self): + """ + Run training. + + Returns: + OrderedDict of results, if evaluation is enabled. Otherwise None. 
+ """ + super().train(self.start_iter, self.max_iter) + if hasattr(self, '_last_eval_results') and comm.is_main_process(): + verify_results(self.cfg, self._last_eval_results) + return self._last_eval_results + + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + from detectron2.data import MetadataCatalog + + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, 'inference') + evaluator_list = [] + evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type + if evaluator_type == 'coco': + from detectron2.evaluation import COCOEvaluator + evaluator_list.append( + COCOEvaluator(dataset_name, True, output_folder)) + if evaluator_type == 'pascal_voc': + from detectron2.evaluation import PascalVOCDetectionEvaluator + return PascalVOCDetectionEvaluator(dataset_name) + if len(evaluator_list) == 0: + raise NotImplementedError( + 'no Evaluator for the dataset {} with the type {}'.format( + dataset_name, evaluator_type)) + if len(evaluator_list) == 1: + return evaluator_list[0] + return DatasetEvaluators(evaluator_list) + + @classmethod + def test(cls, cfg, model, evaluators=None): + from detectron2.engine.defaults import DefaultTrainer as _DefaultTrainer + _DefaultTrainer.build_evaluator = cls.build_evaluator + + return _DefaultTrainer.test(cfg, model, evaluators) + + +@TRAINERS.register_module(module_name=Trainers.image_fewshot_detection) +class ImageDefrcnFewshotTrainer(BaseTrainer): + + def __init__(self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + seed: int = 0, + cfg_modify_fn: Optional[Callable] = None, + **kwargs): + + if isinstance(model, str): + self.model_dir = self.get_or_download_model_dir( + model, model_revision) + if cfg_file is None: + cfg_file = os.path.join(self.model_dir, + ModelFile.CONFIGURATION) + else: + assert cfg_file is not None, 'Config 
file should not be None if model is not from pretrained!' + self.model_dir = os.path.dirname(cfg_file) + + super().__init__(cfg_file, arg_parse_fn) + + if cfg_modify_fn is not None: + self.cfg = cfg_modify_fn(self.cfg) + + self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) + + if isinstance(model, (TorchModel, nn.Module)): + self.model = model + else: + self.model = self.build_model(**kwargs) + + self.model_cfg = self.model.get_model_cfg() + + if 'datasets_train' in kwargs: + self.model_cfg.merge_from_list( + ['DATASETS.TRAIN', kwargs['datasets_train']]) + if 'datasets_test' in kwargs: + self.model_cfg.merge_from_list( + ['DATASETS.TEST', kwargs['datasets_test']]) + if 'work_dir' in kwargs: + self.model_cfg.merge_from_list(['OUTPUT_DIR', kwargs['work_dir']]) + + if not os.path.exists(self.model_cfg.OUTPUT_DIR): + os.makedirs(self.model_cfg.OUTPUT_DIR) + + self.model_cfg.freeze() + + self.data_dir = kwargs.get('data_dir', None) + self.data_type = kwargs.get('data_type', 'pascal_voc') + + self.register_data(self.data_type, self.data_dir) + + self.trainer = DefaultTrainer(self.model, self.model_cfg) + + def train(self, *args, **kwargs): + self.trainer.resume_or_load() + self.trainer.train() + + def evaluate(self, checkpoint_path: str, *args, **kwargs): + from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer + + DetectionCheckpointer( + self.model, + save_dir=self.model_cfg.OUTPUT_DIR).resume_or_load(checkpoint_path) + metric_values = DefaultTrainer.test(self.model_cfg, self.model) + return metric_values + + def build_model(self, *args, **kwargs) -> Union[nn.Module, TorchModel]: + model = Model.from_pretrained(self.model_dir, **kwargs) + if not isinstance(model, nn.Module) and hasattr(model, 'model'): + return model.model + elif isinstance(model, nn.Module): + return model + + @classmethod + def register_data(cls, data_type='pascal_voc', data_dir=None): + + if data_type == 'pascal_voc': + from 
modelscope.models.cv.image_defrcn_fewshot.utils.voc_register import register_all_voc + if data_dir: + register_all_voc(data_dir) + else: + register_all_voc() + else: + raise NotImplementedError( + 'no {} dataset was registered'.format(data_type)) + + @classmethod + def model_surgery(cls, + src_path, + save_dir, + data_type='pascal_voc', + method='remove'): + + assert method in ['remove', + 'randinit'], '{} not implemented'.format(method) + + def _surgery(param_name, is_weight, tar_size, ckpt): + weight_name = param_name + ('.weight' if is_weight else '.bias') + pretrained_weight = ckpt['model'][weight_name] + prev_cls = pretrained_weight.size(0) + if 'cls_score' in param_name: + prev_cls -= 1 + if is_weight: + feat_size = pretrained_weight.size(1) + new_weight = torch.rand((tar_size, feat_size)) + torch.nn.init.normal_(new_weight, 0, 0.01) + else: + new_weight = torch.zeros(tar_size) + + new_weight[:prev_cls] = pretrained_weight[:prev_cls] + if 'cls_score' in param_name: + new_weight[-1] = pretrained_weight[-1] # bg class + ckpt['model'][weight_name] = new_weight + + if data_type == 'pascal_voc': + TAR_SIZE = 20 + params_name = [ + 'model.roi_heads.box_predictor.cls_score', + 'model.roi_heads.box_predictor.bbox_pred' + ] + + save_name = 'model_reset_' + ('remove' if method == 'remove' else + 'surgery') + '.pth' + save_path = os.path.join(save_dir, save_name) + os.makedirs(save_dir, exist_ok=True) + + ckpt = torch.load(src_path) + + if 'scheduler' in ckpt: + del ckpt['scheduler'] + if 'optimizer' in ckpt: + del ckpt['optimizer'] + if 'iteration' in ckpt: + ckpt['iteration'] = 0 + + if method == 'remove': + for param_name in params_name: + del ckpt['model'][param_name + '.weight'] + if param_name + '.bias' in ckpt['model']: + del ckpt['model'][param_name + '.bias'] + else: + tar_sizes = [TAR_SIZE + 1, TAR_SIZE * 4] + for idx, (param_name, + tar_size) in enumerate(zip(params_name, tar_sizes)): + _surgery(param_name, True, tar_size, ckpt) + _surgery(param_name, False, 
tar_size, ckpt) + + torch.save(ckpt, save_path) + else: + NotImplementedError( + '{} dataset does not supported'.format(data_type)) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 91836011..404ec548 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -46,6 +46,7 @@ class CVTasks(object): image_object_detection = 'image-object-detection' video_object_detection = 'video-object-detection' + image_fewshot_detection = 'image-fewshot-detection' image_segmentation = 'image-segmentation' semantic_segmentation = 'semantic-segmentation' diff --git a/tests/pipelines/test_image_defrcn_fewshot.py b/tests/pipelines/test_image_defrcn_fewshot.py new file mode 100644 index 00000000..4658206a --- /dev/null +++ b/tests/pipelines/test_image_defrcn_fewshot.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import subprocess +import sys +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class ImageDefrcnFewShotTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + logger.info('start install detectron2-0.3') + cmd = [ + sys.executable, '-m', 'pip', 'install', 'detectron2==0.3', '-f', + 'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html' + ] + subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + logger.info('install detectron2-0.3 finished') + + self.task = Tasks.image_fewshot_detection + self.model_id = 'damo/cv_resnet101_detection_fewshot-defrcn' + self.image = 'data/test/images/image_voc2007_000001.jpg' + + @unittest.skipUnless(test_level() >= 0, 'skip test in 
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
    # Build the pipeline from an explicitly loaded Model instance.
    defrcn_model = Model.from_pretrained(self.model_id)
    detector = pipeline(task=self.task, model=defrcn_model)
    result = detector(input=self.image)
    print(result[OutputKeys.LABELS])

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_name(self):
    # Let the pipeline resolve the model directly from its hub id.
    detector = pipeline(task=self.task, model=self.model_id)
    result = detector(input=self.image)
    print(result[OutputKeys.LABELS])

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_default_model(self):
    # No model given: the task's registered default model is used.
    detector = pipeline(task=self.task)
    result = detector(input=self.image)
    print(result[OutputKeys.LABELS])

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_by_direct_model_download(self):
    # Fetch the snapshot first, then point the pipeline at the local path.
    local_dir = snapshot_download(self.model_id)
    detector = pipeline(self.task, model=local_dir)
    result = detector(input=self.image)
    print(result[OutputKeys.LABELS])

@unittest.skip('demo compatibility test is only enabled on a needed-basis')
def test_demo_compatibility(self):
    self.compatibility_check()


if __name__ == '__main__':
    unittest.main()
import os
import shutil
import subprocess
import sys
import tempfile
import unittest

from modelscope.hub.utils.utils import get_cache_dir
from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.constant import DownloadMode
from modelscope.utils.test_utils import test_level


class TestImageDefrcnFewShotTrainer(unittest.TestCase):
    """End-to-end test for the DeFRCN few-shot detection trainer.

    setUp installs the pinned detectron2 wheel, creates a scratch work
    directory and downloads the VOC few-shot dataset; test_trainer runs a
    base-training round and checks the expected artifacts are produced.
    """

    def setUp(self):
        print('Testing %s.%s' % (type(self).__name__, self._testMethodName))
        # Best-effort install of the detectron2 build DeFRCN depends on;
        # output is swallowed and a failure surfaces later at import time.
        cmd = [
            sys.executable, '-m', 'pip', 'install', 'detectron2==0.3', '-f',
            'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html'
        ]
        subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # Bug fix: tempfile.TemporaryDirectory().name discards the manager
        # object, whose GC finalizer deletes the directory again at an
        # arbitrary time. mkdtemp() creates a directory that persists until
        # tearDown removes it.
        self.tmp_dir = tempfile.mkdtemp()

        self.model_id = 'damo/cv_resnet101_detection_fewshot-defrcn'

        data_voc = MsDataset.load(
            dataset_name='VOC_fewshot',
            namespace='shimin2023',
            split='train',
            download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)
        self.data_dir = os.path.join(
            data_voc.config_kwargs['split_config']['train'], 'data')

    def tearDown(self):
        shutil.rmtree(self.tmp_dir)
        super().tearDown()

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_trainer(self):
        # Base-training on VOC split 1; config and dataset names follow the
        # DeFRCN naming scheme (defrcn_det_r101_base<split>).
        split = 1
        kwargs = dict(
            model=self.model_id,
            data_dir=self.data_dir,
            work_dir=self.tmp_dir,
            model_weights=os.path.join(get_cache_dir(), self.model_id,
                                       'ImageNetPretrained/MSRA/R-101.pkl'),
            data_type='pascal_voc',
            config_path='defrcn_det_r101_base{}.yaml'.format(split),
            datasets_train=('voc_2007_trainval_base{}'.format(split),
                            'voc_2012_trainval_base{}'.format(split)),
            datasets_test=('voc_2007_test_base{}'.format(split), ))
        trainer = build_trainer(
            name=Trainers.image_fewshot_detection, default_args=kwargs)
        trainer.train()

        # Training must leave its metrics log and final weights in work_dir.
        results_files = os.listdir(self.tmp_dir)
        self.assertIn('metrics.json', results_files)
        self.assertIn('model_final.pth', results_files)


if __name__ == '__main__':
    unittest.main()