add cv/image-defrcn-fewshot-detection

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11364804

* add model defrcn-fewshot-detection

* add requirements check
This commit is contained in:
shimin.ysm
2023-01-12 12:48:38 +00:00
committed by wenmeng.zwm
parent 8cd79a4fea
commit f7930c23a0
24 changed files with 1965 additions and 16 deletions

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f0bdad67d01aa452929683b74a124a2926b6bce534c85f3ee0f00e20eeacab0
size 78771

View File

@@ -76,6 +76,7 @@ class Models(object):
image_casmvs_depth_estimation = 'image-casmvs-depth-estimation'
vop_retrieval_model = 'vop-retrieval-model'
ddcolor = 'ddcolor'
defrcn = 'defrcn'
image_face_fusion = 'image-face-fusion'
# EasyCV models
@@ -296,6 +297,7 @@ class Pipelines(object):
image_multi_view_depth_estimation = 'image-multi-view-depth-estimation'
vop_retrieval = 'vop-video-text-retrieval'
ddcolor_image_colorization = 'ddcolor-image-colorization'
image_fewshot_detection = 'image-fewshot-detection'
image_face_fusion = 'image-face-fusion'
# nlp tasks
@@ -416,6 +418,7 @@ class Trainers(object):
referring_video_object_segmentation = 'referring-video-object-segmentation'
image_classification_team = 'image-classification-team'
image_classification = 'image-classification'
image_fewshot_detection = 'image-fewshot-detection'
# nlp trainers
bert_sentiment_analysis = 'bert-sentiment-analysis'

View File

@@ -5,20 +5,20 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints,
body_3d_keypoints, cartoon, cmdssl_video_embedding,
crowd_counting, face_2d_keypoints, face_detection,
face_generation, human_wholebody_keypoint, image_classification,
image_color_enhance, image_colorization, image_denoise,
image_inpainting, image_instance_segmentation, image_matching,
image_mvs_depth_estimation, image_panoptic_segmentation,
image_portrait_enhancement, image_reid_person,
image_semantic_segmentation, image_to_image_generation,
image_to_image_translation, language_guided_video_summarization,
movie_scene_segmentation, object_detection,
panorama_depth_estimation, pointcloud_sceneflow_estimation,
product_retrieval_embedding, realtime_object_detection,
referring_video_object_segmentation, salient_detection,
shop_segmentation, super_resolution, video_frame_interpolation,
video_object_segmentation, video_single_object_tracking,
video_stabilization, video_summarization,
video_super_resolution, virual_tryon, vision_middleware,
vop_retrieval)
image_color_enhance, image_colorization, image_defrcn_fewshot,
image_denoise, image_inpainting, image_instance_segmentation,
image_matching, image_mvs_depth_estimation,
image_panoptic_segmentation, image_portrait_enhancement,
image_reid_person, image_semantic_segmentation,
image_to_image_generation, image_to_image_translation,
language_guided_video_summarization, movie_scene_segmentation,
object_detection, panorama_depth_estimation,
pointcloud_sceneflow_estimation, product_retrieval_embedding,
realtime_object_detection, referring_video_object_segmentation,
salient_detection, shop_segmentation, super_resolution,
video_frame_interpolation, video_object_segmentation,
video_single_object_tracking, video_stabilization,
video_summarization, video_super_resolution, virual_tryon,
vision_middleware, vop_retrieval)
# yapf: enable

View File

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    # Real import only for static type checkers / IDEs; at runtime the
    # submodule is loaded lazily to avoid importing heavy dependencies
    # (torch, detectron2) until the model is actually used.
    from .defrcn_for_fewshot import DeFRCNForFewShot
else:
    # Map: submodule name -> list of public symbols it provides.
    _import_structure = {'defrcn_for_fewshot': ['DeFRCNForFewShot']}

    import sys

    # Replace this package module with a lazy proxy that imports the
    # submodules above on first attribute access.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

View File

@@ -0,0 +1,80 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict
import torch
from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .models.defaults_config import _C
from .models.defrcn import DeFRCN
from .utils.requirements_check import requires_version
logger = get_logger()
__all__ = ['DeFRCNForFewShot']
@MODELS.register_module(
    Tasks.image_fewshot_detection, module_name=Models.defrcn)
class DeFRCNForFewShot(TorchModel):
    """Few-shot object detection model DeFRCN.

    Requires detectron2-0.3 and pytorch-1.11 (verified at construction
    time). Model configuration follows detectron2's config system; the
    available keys are documented in ``detectron2.config.defaults`` and
    ``.models.defaults_config``.
    """

    def __init__(self, model_dir: str, *args, **kwargs):
        """Build the DeFRCN model from a local model directory.

        Args:
            model_dir (str): directory holding the modelscope configuration
                file and the detectron2-style model config.
            **kwargs: optional overrides:
                config_path: relative path of the detectron2 config file.
                model_weights: checkpoint path written to MODEL.WEIGHTS.
        """
        # Fail fast on incompatible detectron2 / torch before any heavy init.
        requires_version()
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir

        # Load the modelscope-level configuration.json and apply overrides.
        self.config = Config.from_file(
            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
        if 'config_path' in kwargs:
            self.config.merge_from_dict(
                {'model.config_path': kwargs['config_path']})

        # Build the detectron2 config: defaults -> file -> kwargs overrides.
        cfg = _C.clone()
        cfg.merge_from_file(
            os.path.join(model_dir, self.config.model.config_path))
        if 'model_weights' in kwargs:
            cfg.merge_from_list(['MODEL.WEIGHTS', kwargs['model_weights']])
        cfg.freeze()
        self.model_cfg = cfg
        self.model = DeFRCN(self.model_cfg)

    def forward(self, inputs) -> Any:
        """Dispatch to training forward or inference based on module mode.

        Args:
            inputs (list): the preprocessed batched inputs.

        Returns:
            Any: a loss dict in training mode, detection results otherwise.
        """
        if not self.training:
            return self.model.inference(inputs)
        return self.model.forward(inputs)

    def inference(self, input: Dict[str, Any]) -> Any:
        """Run no-grad inference on one preprocessed sample; returns the
        single result or None when the model produces nothing."""
        with torch.no_grad():
            outputs = self.model([input])
        return outputs[0] if len(outputs) > 0 else None

    def get_model_cfg(self):
        """Return the frozen detectron2 config the model was built with."""
        return self.model_cfg

View File

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    # Import for static analysis only; runtime loading is deferred.
    from .defrcn import DeFRCN
else:
    # Map: submodule name -> list of public symbols it provides.
    _import_structure = {'defrcn': ['DeFRCN']}

    import sys

    # Install a lazy proxy so detectron2-dependent code is only imported
    # when DeFRCN is first accessed.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

View File

@@ -0,0 +1,38 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/config/defaults.py
from detectron2.config.defaults import _C

# NOTE: _CC is an alias for detectron2's global default config; the
# DeFRCN-specific keys below are added to _C in place as an import side
# effect, and consumers import `_C` from this module.
_CC = _C

# ----------- Backbone ----------- #
# FREEZE disables gradients for the whole backbone (see DeFRCN.__init__).
_CC.MODEL.BACKBONE.FREEZE = False
_CC.MODEL.BACKBONE.FREEZE_AT = 3

# ------------- RPN -------------- #
# FREEZE disables gradients for the proposal generator; ENABLE_DECOUPLE /
# BACKWARD_SCALE control the gradient-decoupled layer on the RPN branch.
_CC.MODEL.RPN.FREEZE = False
_CC.MODEL.RPN.ENABLE_DECOUPLE = False
_CC.MODEL.RPN.BACKWARD_SCALE = 1.0

# ------------- ROI -------------- #
# FREEZE_FEAT freezes the shared res5 feature block; ENABLE_DECOUPLE /
# BACKWARD_SCALE control gradient decoupling on the RCNN branch.
_CC.MODEL.ROI_HEADS.NAME = 'Res5ROIHeads'
_CC.MODEL.ROI_HEADS.FREEZE_FEAT = False
_CC.MODEL.ROI_HEADS.ENABLE_DECOUPLE = False
_CC.MODEL.ROI_HEADS.BACKWARD_SCALE = 1.0
_CC.MODEL.ROI_HEADS.OUTPUT_LAYER = 'FastRCNNOutputLayers'
# Optional dropout before the classification score layer (see
# FastRCNNOutputLayers.forward).
_CC.MODEL.ROI_HEADS.CLS_DROPOUT = False
_CC.MODEL.ROI_HEADS.DROPOUT_RATIO = 0.8
_CC.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 7  # for faster

# ------------- TEST ------------- #
# PCB = Prototypical Calibration Block settings used at test time.
_CC.TEST.PCB_ENABLE = False
_CC.TEST.PCB_MODELTYPE = 'resnet'  # res-like
_CC.TEST.PCB_MODELPATH = ''
_CC.TEST.PCB_ALPHA = 0.50
_CC.TEST.PCB_UPPER = 1.0
_CC.TEST.PCB_LOWER = 0.05

# ------------ Other ------------- #
_CC.SOLVER.WEIGHT_DECAY = 5e-5
_CC.MUTE_HEADER = True

View File

@@ -0,0 +1,179 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/rcnn.py
import os
from typing import Dict
import torch
from detectron2.layers import ShapeSpec
from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
from detectron2.modeling.backbone.resnet import build_resnet_backbone
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.proposal_generator.rpn import RPN, StandardRPNHead
from detectron2.structures import ImageList
from torch import nn
from .gdl import AffineLayer, decouple_layer
from .roi_heads import Res5ROIHeads
class DeFRCN(nn.Module):
    """DeFRCN meta-architecture for few-shot detection.

    A Faster R-CNN style detector (ResNet backbone + RPN + C4 ROI heads)
    with Gradient Decoupled Layers (see .gdl) inserted between the shared
    backbone features and each of the RPN / RCNN branches: the forward pass
    applies a learned channel-wise affine transform, while the backward
    gradient of each branch is scaled by its configured BACKWARD_SCALE.
    """

    def __init__(self, cfg):
        """Build backbone, RPN, ROI heads and the GDL affine layers from ``cfg``."""
        super().__init__()
        self.cfg = cfg
        self.device = torch.device(cfg.MODEL.DEVICE)
        # Input channel count is inferred from the per-channel pixel mean.
        self.backbone = build_resnet_backbone(
            cfg, ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        self._SHAPE_ = self.backbone.output_shape()
        rpn_config = DeFRCN.from_rpn_config(cfg, self._SHAPE_)
        self.proposal_generator = RPN(**rpn_config)
        self.roi_heads = Res5ROIHeads(cfg, self._SHAPE_)
        self.normalizer = self.normalize_fn()
        # One affine layer per decoupled branch, applied on the 'res4'
        # feature map after gradient decoupling.
        self.affine_rpn = AffineLayer(
            num_channels=self._SHAPE_['res4'].channels, bias=True)
        self.affine_rcnn = AffineLayer(
            num_channels=self._SHAPE_['res4'].channels, bias=True)
        self.to(self.device)

        # Optionally freeze sub-networks (used during few-shot fine-tuning).
        if cfg.MODEL.BACKBONE.FREEZE:
            for p in self.backbone.parameters():
                p.requires_grad = False
        if cfg.MODEL.RPN.FREEZE:
            for p in self.proposal_generator.parameters():
                p.requires_grad = False
        if cfg.MODEL.ROI_HEADS.FREEZE_FEAT:
            for p in self.roi_heads.res5.parameters():
                p.requires_grad = False

    def forward(self, batched_inputs):
        """Training forward pass returning the merged RPN + RCNN loss dict.

        Delegates to :meth:`inference` when the module is in eval mode.
        """
        if not self.training:
            return self.inference(batched_inputs)
        assert 'instances' in batched_inputs[0]
        gt_instances = [x['instances'].to(self.device) for x in batched_inputs]

        proposal_losses, detector_losses, _, _ = self._forward_once_(
            batched_inputs, gt_instances)
        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses

    def inference(self, batched_inputs):
        """Inference pass.

        Returns:
            list[dict]: one dict per image with an 'instances' field, with
            boxes rescaled to each input's original 'height'/'width' when
            those keys are provided (falls back to the padded image size).
        """
        assert not self.training
        _, _, results, image_sizes = self._forward_once_(batched_inputs, None)
        processed_results = []
        for r, input, image_size in zip(results, batched_inputs, image_sizes):
            height = input.get('height', image_size[0])
            width = input.get('width', image_size[1])
            r = detector_postprocess(r, height, width)
            processed_results.append({'instances': r})
        return processed_results

    def _forward_once_(self, batched_inputs, gt_instances=None):
        """Shared forward used by both training and inference.

        Returns:
            (proposal_losses, detector_losses, results, image_sizes)
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        # RPN branch: optionally decouple gradients from the backbone
        # (identity forward with scaled backward) and apply the affine layer.
        features_de_rpn = features
        if self.cfg.MODEL.RPN.ENABLE_DECOUPLE:
            scale = self.cfg.MODEL.RPN.BACKWARD_SCALE
            features_de_rpn = {
                k: self.affine_rpn(decouple_layer(features[k], scale))
                for k in features
            }
        proposals, proposal_losses = self.proposal_generator(
            images, features_de_rpn, gt_instances)

        # RCNN branch: same decoupling with its own affine layer and scale.
        features_de_rcnn = features
        if self.cfg.MODEL.ROI_HEADS.ENABLE_DECOUPLE:
            scale = self.cfg.MODEL.ROI_HEADS.BACKWARD_SCALE
            features_de_rcnn = {
                k: self.affine_rcnn(decouple_layer(features[k], scale))
                for k in features
            }
        results, detector_losses = self.roi_heads(images, features_de_rcnn,
                                                  proposals, gt_instances)

        return proposal_losses, detector_losses, results, images.image_sizes

    def preprocess_image(self, batched_inputs):
        """Normalize each input image and batch them into a padded ImageList."""
        images = [x['image'].to(self.device) for x in batched_inputs]
        images = [self.normalizer(x) for x in images]
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)
        return images

    def normalize_fn(self):
        """Return a closure normalizing a CHW image tensor with the
        configured per-channel pixel mean and std."""
        assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD)
        num_channels = len(self.cfg.MODEL.PIXEL_MEAN)
        pixel_mean = (
            torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).to(self.device).view(
                num_channels, 1, 1))
        pixel_std = (
            torch.Tensor(self.cfg.MODEL.PIXEL_STD).to(self.device).view(
                num_channels, 1, 1))
        return lambda x: (x - pixel_mean) / pixel_std

    @classmethod
    def from_rpn_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Translate a detectron2 config into the keyword arguments expected
        by detectron2's ``RPN`` constructor.

        Builds the anchor generator, anchor matcher and RPN head explicitly
        rather than going through detectron2's registry.
        """
        in_features = cfg.MODEL.RPN.IN_FEATURES
        ret = {
            'in_features':
            in_features,
            'min_box_size':
            cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE,
            'nms_thresh':
            cfg.MODEL.RPN.NMS_THRESH,
            'batch_size_per_image':
            cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE,
            'positive_fraction':
            cfg.MODEL.RPN.POSITIVE_FRACTION,
            'loss_weight': {
                'loss_rpn_cls':
                cfg.MODEL.RPN.LOSS_WEIGHT,
                'loss_rpn_loc':
                cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT,
            },
            'anchor_boundary_thresh':
            cfg.MODEL.RPN.BOUNDARY_THRESH,
            'box2box_transform':
            Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS),
            'box_reg_loss_type':
            cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE,
            'smooth_l1_beta':
            cfg.MODEL.RPN.SMOOTH_L1_BETA,
        }
        # (train, test) pairs for pre/post-NMS proposal counts.
        ret['pre_nms_topk'] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN,
                               cfg.MODEL.RPN.PRE_NMS_TOPK_TEST)
        ret['post_nms_topk'] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN,
                                cfg.MODEL.RPN.POST_NMS_TOPK_TEST)
        # The registry call build_anchor_generator(cfg, ...) is bypassed; the
        # default anchor generator is constructed explicitly instead.
        anchor_cfg = DefaultAnchorGenerator.from_config(
            cfg, [input_shape[f] for f in in_features])
        ret['anchor_generator'] = DefaultAnchorGenerator(**anchor_cfg)
        ret['anchor_matcher'] = Matcher(
            cfg.MODEL.RPN.IOU_THRESHOLDS,
            cfg.MODEL.RPN.IOU_LABELS,
            allow_low_quality_matches=True)
        # Standard RPN head over the first (and typically only) input level.
        rpn_head_cfg = {
            'in_channels':
            [s.channels for s in [input_shape[f] for f in in_features]][0],
            'num_anchors':
            ret['anchor_generator'].num_anchors[0],
            'box_dim':
            ret['anchor_generator'].box_dim
        }
        ret['head'] = StandardRPNHead(**rpn_head_cfg)
        return ret

View File

@@ -0,0 +1,274 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/roi_heads/fast_rcnn.py
import numpy as np
import torch
from detectron2.layers import batched_nms, cat
from detectron2.modeling.roi_heads.fast_rcnn import \
fast_rcnn_inference_single_image
from detectron2.utils.events import get_event_storage
from fvcore.nn import smooth_l1_loss
from torch import nn
from torch.nn import functional as F
def fast_rcnn_inference(boxes, scores, image_shapes, score_thresh, nms_thresh,
                        topk_per_image):
    """Run per-image Fast R-CNN post-processing (thresholding + NMS + top-k).

    Args:
        boxes (list[Tensor]): predicted boxes, one tensor per image.
        scores (list[Tensor]): predicted class probabilities, one per image.
        image_shapes (list[tuple]): (height, width) of each image.
        score_thresh (float): minimum score to keep a detection.
        nms_thresh (float): IoU threshold for NMS.
        topk_per_image (int): keep at most this many detections per image.

    Returns:
        tuple(list, list): per-image detected Instances and per-image kept
        indices, as produced by ``fast_rcnn_inference_single_image``.
    """
    per_image_outputs = []
    for scores_per_image, boxes_per_image, image_shape in zip(
            scores, boxes, image_shapes):
        per_image_outputs.append(
            fast_rcnn_inference_single_image(
                boxes_per_image,
                scores_per_image,
                image_shape,
                score_thresh,
                nms_thresh,
                topk_per_image,
            ))
    return tuple(list(x) for x in zip(*per_image_outputs))
class FastRCNNOutputs(object):
    """
    A class that stores information about outputs of a Fast R-CNN head,
    and computes losses (training) or final detections (inference) from them.
    """

    def __init__(
        self,
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta,
    ):
        """
        Args:
            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
                box2box transform instance for proposal-to-detection transformations.
            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
                logits for all R predicted object instances.
                Each row corresponds to a predicted object instance.
            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
                class-specific or class-agnostic regression. It stores the predicted deltas that
                transform proposals into final box detections.
                B is the box dimension (4 or 5).
                When B is 4, each row is [dx, dy, dw, dh (, ....)].
                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
            proposals (list[Instances]): A list of N Instances, where Instances i stores the
                proposals for image i, in the field "proposal_boxes".
                When training, each Instances must have ground-truth labels
                stored in the field "gt_classes" and "gt_boxes".
            smooth_l1_beta (float): The transition point between L1 and L2 loss in
                the smooth L1 loss function. When set to 0, the loss becomes L1. When
                set to +inf, the loss becomes constant 0.
        """
        self.box2box_transform = box2box_transform
        # Per-image proposal counts, used later to split flat predictions.
        self.num_preds_per_image = [len(p) for p in proposals]
        self.pred_class_logits = pred_class_logits
        self.pred_proposal_deltas = pred_proposal_deltas
        self.smooth_l1_beta = smooth_l1_beta

        box_type = type(proposals[0].proposal_boxes)
        # cat(..., dim=0) concatenates over all images in the batch
        self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
        assert (not self.proposals.tensor.requires_grad
                ), 'Proposals should not require gradients!'
        self.image_shapes = [x.image_size for x in proposals]

        # The following fields should exist only when training.
        if proposals[0].has('gt_boxes'):
            self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
            assert proposals[0].has('gt_classes')
            self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)

    def _log_accuracy(self):
        """
        Log the accuracy metrics to EventStorage.
        """
        num_instances = self.gt_classes.numel()
        pred_classes = self.pred_class_logits.argmax(dim=1)
        # The last logit column is the implicit background class.
        bg_class_ind = self.pred_class_logits.shape[1] - 1

        fg_inds = (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind)
        num_fg = fg_inds.nonzero().numel()
        fg_gt_classes = self.gt_classes[fg_inds]
        fg_pred_classes = pred_classes[fg_inds]

        # Foreground proposals misclassified as background.
        num_false_negative = ((
            fg_pred_classes == bg_class_ind).nonzero().numel())
        num_accurate = (pred_classes == self.gt_classes).nonzero().numel()
        fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()

        storage = get_event_storage()
        # NOTE(review): assumes num_instances > 0; an empty proposal batch
        # would divide by zero here — confirm upstream always samples >= 1.
        storage.put_scalar('fast_rcnn/cls_accuracy',
                           num_accurate / num_instances)
        if num_fg > 0:
            storage.put_scalar('fast_rcnn/fg_cls_accuracy',
                               fg_num_accurate / num_fg)
            storage.put_scalar('fast_rcnn/false_negative',
                               num_false_negative / num_fg)

    def softmax_cross_entropy_loss(self):
        """
        Compute the softmax cross entropy loss for box classification.
        Returns:
            scalar Tensor
        """
        self._log_accuracy()
        return F.cross_entropy(
            self.pred_class_logits, self.gt_classes, reduction='mean')

    def smooth_l1_loss(self):
        """
        Compute the smooth L1 loss for box regression.
        Returns:
            scalar Tensor
        """
        # Regression targets: deltas mapping each proposal onto its gt box.
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor, self.gt_boxes.tensor)
        box_dim = gt_proposal_deltas.size(1)  # 4 or 5
        # Class-agnostic regression predicts a single set of B deltas.
        cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
        device = self.pred_proposal_deltas.device

        bg_class_ind = self.pred_class_logits.shape[1] - 1

        # Only foreground proposals contribute to the regression loss.
        fg_inds = torch.nonzero((self.gt_classes >= 0)
                                & (self.gt_classes < bg_class_ind)).squeeze(1)
        if cls_agnostic_bbox_reg:
            # pred_proposal_deltas only corresponds to foreground class for agnostic
            gt_class_cols = torch.arange(box_dim, device=device)
        else:
            # Select the B delta columns belonging to each proposal's gt class.
            fg_gt_classes = self.gt_classes[fg_inds]
            gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
                box_dim, device=device)

        loss_box_reg = smooth_l1_loss(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            gt_proposal_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction='sum',
        )
        # Normalize by the total number of sampled proposals (fg + bg),
        # not only the foreground count.
        loss_box_reg = loss_box_reg / self.gt_classes.numel()
        return loss_box_reg

    def losses(self):
        """
        Compute the default losses for box head in Fast(er) R-CNN,
        with softmax cross entropy loss and smooth L1 loss.
        Returns:
            A dict of losses (scalar tensors) containing keys "loss_cls" and "loss_box_reg".
        """
        return {
            'loss_cls': self.softmax_cross_entropy_loss(),
            'loss_box_reg': self.smooth_l1_loss(),
        }

    def predict_boxes(self):
        """
        Returns:
            list[Tensor]: A list of Tensors of predicted class-specific or class-agnostic boxes
                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
                the number of predicted objects for image i and B is the box dimension (4 or 5)
        """
        num_pred = len(self.proposals)
        B = self.proposals.tensor.shape[1]
        K = self.pred_proposal_deltas.shape[1] // B
        # Apply each proposal's K sets of deltas to K broadcast copies of
        # the proposal box, then split the flat result back per image.
        boxes = self.box2box_transform.apply_deltas(
            self.pred_proposal_deltas.view(num_pred * K, B),
            self.proposals.tensor.unsqueeze(1).expand(num_pred, K,
                                                      B).reshape(-1, B),
        )
        return boxes.view(num_pred, K * B).split(
            self.num_preds_per_image, dim=0)

    def predict_probs(self):
        """
        Returns:
            list[Tensor]: A list of Tensors of predicted class probabilities for each image.
                Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
                for image i.
        """
        probs = F.softmax(self.pred_class_logits, dim=-1)
        return probs.split(self.num_preds_per_image, dim=0)

    def inference(self, score_thresh, nms_thresh, topk_per_image):
        """
        Args:
            score_thresh (float): same as fast_rcnn_inference.
            nms_thresh (float): same as fast_rcnn_inference.
            topk_per_image (int): same as fast_rcnn_inference.
        Returns:
            list[Instances]: same as fast_rcnn_inference.
            list[Tensor]: same as fast_rcnn_inference.
        """
        boxes = self.predict_boxes()
        scores = self.predict_probs()
        image_shapes = self.image_shapes

        return fast_rcnn_inference(
            boxes,
            scores,
            image_shapes,
            score_thresh,
            nms_thresh,
            topk_per_image,
        )
class FastRCNNOutputLayers(nn.Module):
    """Fast R-CNN prediction heads.

    Two parallel linear layers on top of the pooled per-region features:
    (1) classification scores over ``num_classes`` foreground classes plus
    one background class, and (2) proposal-to-detection box regression
    deltas (class-specific or class-agnostic).
    """

    def __init__(self,
                 cfg,
                 input_size,
                 num_classes,
                 cls_agnostic_bbox_reg,
                 box_dim=4):
        """
        Args:
            cfg: config
            input_size (int): channels, or (channels, height, width)
            num_classes (int): number of foreground classes
            cls_agnostic_bbox_reg (bool): share one set of deltas across classes
            box_dim (int): 4 for regular XYXY boxes, 5 for rotated XYWHA boxes
        """
        super(FastRCNNOutputLayers, self).__init__()
        if not isinstance(input_size, int):
            # Flatten a (channels, height, width) shape into one feature size.
            input_size = np.prod(input_size)

        # One extra output for the implicit background class.
        self.cls_score = nn.Linear(input_size, num_classes + 1)
        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
        self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)

        nn.init.normal_(self.cls_score.weight, std=0.01)
        nn.init.normal_(self.bbox_pred.weight, std=0.001)
        for layer in [self.cls_score, self.bbox_pred]:
            nn.init.constant_(layer.bias, 0)

        self._do_cls_dropout = cfg.MODEL.ROI_HEADS.CLS_DROPOUT
        self._dropout_ratio = cfg.MODEL.ROI_HEADS.DROPOUT_RATIO

    def forward(self, x):
        """Return ``(scores, proposal_deltas)`` for input features ``x``."""
        if x.dim() > 2:
            x = torch.flatten(x, start_dim=1)
        proposal_deltas = self.bbox_pred(x)

        # Dropout (when enabled) is applied to the classification branch
        # only; the regression branch sees the undropped features.
        cls_input = x
        if self._do_cls_dropout:
            cls_input = F.dropout(
                cls_input, self._dropout_ratio, training=self.training)
        scores = self.cls_score(cls_input)

        return scores, proposal_deltas

View File

@@ -0,0 +1,43 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/gdl.py
import torch
import torch.nn as nn
from torch.autograd import Function
class GradientDecoupleLayer(Function):
    """Identity in the forward pass; scales gradients by ``_lambda`` in the
    backward pass (the core operation of DeFRCN's gradient decoupling)."""

    @staticmethod
    def forward(ctx, x, _lambda):
        # Stash the scale factor for use during backpropagation.
        ctx._lambda = _lambda
        return x

    @staticmethod
    def backward(ctx, grad_output):
        # Scale the incoming gradient; no gradient flows to ``_lambda``.
        return grad_output * ctx._lambda, None
class AffineLayer(nn.Module):
    """Per-channel affine transform ``y = w * x (+ b)``.

    The weight is initialized to 1 and the optional bias to 0, so the layer
    starts as an identity mapping.
    """

    def __init__(self, num_channels, bias=False):
        super(AffineLayer, self).__init__()
        self.weight = nn.Parameter(
            torch.ones(1, num_channels, 1, 1), requires_grad=True)
        if bias:
            self.bias = nn.Parameter(
                torch.zeros(1, num_channels, 1, 1), requires_grad=True)
        else:
            # No bias term requested; forward() skips the addition.
            self.bias = None

    def forward(self, X):
        scaled = self.weight.expand_as(X) * X
        if self.bias is None:
            return scaled
        return scaled + self.bias.expand_as(X)
def decouple_layer(x, _lambda):
    """Apply gradient decoupling to ``x``: identity in the forward pass,
    gradients scaled by ``_lambda`` in the backward pass."""
    return GradientDecoupleLayer.apply(x, _lambda)

View File

@@ -0,0 +1,302 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/roi_heads/roi_heads.py
from typing import Dict
import numpy as np
import torch
from detectron2.layers import ShapeSpec
from detectron2.modeling.backbone.resnet import BottleneckBlock, make_stage
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.proposal_generator.proposal_utils import \
add_ground_truth_to_proposals
from detectron2.modeling.roi_heads import select_foreground_proposals
from detectron2.modeling.sampling import subsample_labels
from detectron2.structures import Boxes, Instances, pairwise_iou
from detectron2.utils.events import get_event_storage
from torch import nn
from .fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs
class ROIHeads(torch.nn.Module):
    """
    ROIHeads perform all per-region computation in an R-CNN.
    It contains logic of cropping the regions, extract per-region features,
    and make per-region predictions.
    It can have many variants, implemented as subclasses of this class.
    """

    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
        super(ROIHeads, self).__init__()

        # fmt: off
        self.batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE
        self.positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
        self.test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
        self.test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
        self.test_detections_per_img = cfg.TEST.DETECTIONS_PER_IMAGE
        self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
        self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES
        self.proposal_append_gt = cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT
        # Per-feature-map stride and channel count from the backbone shapes.
        self.feature_strides = {k: v.stride for k, v in input_shape.items()}
        self.feature_channels = {k: v.channels for k, v in input_shape.items()}
        self.cls_agnostic_bbox_reg = cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
        self.smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA
        # fmt: on

        # Matcher to assign box proposals to gt boxes
        self.proposal_matcher = Matcher(
            cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS,
            cfg.MODEL.ROI_HEADS.IOU_LABELS,
            allow_low_quality_matches=False,
        )

        # Box2BoxTransform for bounding box regression
        self.box2box_transform = Box2BoxTransform(
            weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)

    def _sample_proposals(self, matched_idxs, matched_labels, gt_classes):
        """
        Based on the matching between N proposals and M groundtruth,
        sample the proposals and set their classification labels.
        Args:
            matched_idxs (Tensor): a vector of length N, each is the best-matched
                gt index in [0, M) for each proposal.
            matched_labels (Tensor): a vector of length N, the matcher's label
                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
            gt_classes (Tensor): a vector of length M.
        Returns:
            Tensor: a vector of indices of sampled proposals. Each is in [0, N).
            Tensor: a vector of the same length, the classification label for
                each sampled proposal. Each sample is labeled as either a category in
                [0, num_classes) or the background (num_classes).
        """
        has_gt = gt_classes.numel() > 0
        # Get the corresponding GT for each proposal
        if has_gt:
            gt_classes = gt_classes[matched_idxs]
            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
            gt_classes[matched_labels == 0] = self.num_classes
            # Label ignore proposals (-1 label)
            gt_classes[matched_labels == -1] = -1
        else:
            # No ground truth in this image: everything is background.
            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes

        # Subsample to batch_size_per_image with the configured fg fraction.
        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
            gt_classes,
            self.batch_size_per_image,
            self.positive_sample_fraction,
            self.num_classes,
        )

        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
        return sampled_idxs, gt_classes[sampled_idxs]

    @torch.no_grad()
    def label_and_sample_proposals(self, proposals, targets):
        """
        Prepare some proposals to be used to train the ROI heads.
        It performs box matching between `proposals` and `targets`, and assigns
        training labels to the proposals.
        It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes,
        with a fraction of positives that is no larger than `self.positive_sample_fraction`.
        Args:
            See :meth:`ROIHeads.forward`
        Returns:
            list[Instances]:
                length `N` list of `Instances`s containing the proposals
                sampled for training. Each `Instances` has the following fields:
                - proposal_boxes: the proposal boxes
                - gt_boxes: the ground-truth box that the proposal is assigned to
                  (this is only meaningful if the proposal has a label > 0; if label = 0
                  then the ground-truth box is random)
                Other fields such as "gt_classes" that's included in `targets`.
        """
        gt_boxes = [x.gt_boxes for x in targets]
        if self.proposal_append_gt:
            # Append gt boxes as proposals so each gt has at least one match.
            proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

        proposals_with_gt = []

        num_fg_samples = []
        num_bg_samples = []
        for proposals_per_image, targets_per_image in zip(proposals, targets):
            has_gt = len(targets_per_image) > 0
            match_quality_matrix = pairwise_iou(
                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
            matched_idxs, matched_labels = self.proposal_matcher(
                match_quality_matrix)
            sampled_idxs, gt_classes = self._sample_proposals(
                matched_idxs, matched_labels, targets_per_image.gt_classes)

            # Set target attributes of the sampled proposals:
            proposals_per_image = proposals_per_image[sampled_idxs]
            proposals_per_image.gt_classes = gt_classes

            # We index all the attributes of targets that start with "gt_"
            # and have not been added to proposals yet (="gt_classes").
            if has_gt:
                sampled_targets = matched_idxs[sampled_idxs]
                for (
                        trg_name,
                        trg_value,
                ) in targets_per_image.get_fields().items():
                    if trg_name.startswith(
                            'gt_') and not proposals_per_image.has(trg_name):
                        proposals_per_image.set(trg_name,
                                                trg_value[sampled_targets])
            else:
                # No gt in this image: attach all-zero gt boxes as placeholders.
                gt_boxes = Boxes(
                    targets_per_image.gt_boxes.tensor.new_zeros(
                        (len(sampled_idxs), 4)))
                proposals_per_image.gt_boxes = gt_boxes

            num_bg_samples.append(
                (gt_classes == self.num_classes).sum().item())
            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
            proposals_with_gt.append(proposals_per_image)

        # Log the number of fg/bg samples that are selected for training ROI heads
        storage = get_event_storage()
        storage.put_scalar('roi_head/num_fg_samples', np.mean(num_fg_samples))
        storage.put_scalar('roi_head/num_bg_samples', np.mean(num_bg_samples))

        return proposals_with_gt

    def forward(self, images, features, proposals, targets=None):
        """
        Args:
            images (ImageList):
            features (dict[str: Tensor]): input data as a mapping from feature
                map name to tensor. Axis 0 represents the number of images `N` in
                the input data; axes 1-3 are channels, height, and width, which may
                vary between feature maps (e.g., if a feature pyramid is used).
            proposals (list[Instances]): length `N` list of `Instances`s. The i-th
                `Instances` contains object proposals for the i-th input image,
                with fields "proposal_boxes" and "objectness_logits".
            targets (list[Instances], optional): length `N` list of `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image. Specify `targets` during training only.
                It may have the following fields:
                - gt_boxes: the bounding box of each instance.
                - gt_classes: the label for each instance with a category ranging in [0, #class].
        Returns:
            results (list[Instances]): length `N` list of `Instances`s containing the
                detected instances. Returned during inference only; may be []
                during training.
            losses (dict[str: Tensor]): mapping from a named loss to a tensor
                storing the loss. Used during training only.
        """
        raise NotImplementedError()
class Res5ROIHeads(ROIHeads):
    """
    The ROIHeads in a typical "C4" R-CNN model, where the heads share the
    cropping and the per-region feature computation by a Res5 block.
    """

    def __init__(self, cfg, input_shape):
        super().__init__(cfg, input_shape)

        # The C4 head operates on exactly one backbone feature map.
        assert len(self.in_features) == 1

        # fmt: off
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        pooler_scales = (1.0 / self.feature_strides[self.in_features[0]], )
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        # fmt: on

        self.pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )

        # Shared res5 block acts as the box head; its output channel count
        # determines the predictor's input size.
        self.res5, out_channels = self._build_res5_block(cfg)
        self.box_predictor = FastRCNNOutputLayers(cfg, out_channels,
                                                  self.num_classes,
                                                  self.cls_agnostic_bbox_reg)

    def _build_res5_block(self, cfg):
        """Construct the res5 stage used as the shared per-region feature head.

        Returns:
            (nn.Sequential, int): the res5 blocks and their output channels.
        """
        # fmt: off
        stage_channel_factor = 2**3  # res5 is 8x res2
        num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
        width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
        bottleneck_channels = num_groups * width_per_group * stage_channel_factor
        out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * stage_channel_factor
        stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1
        norm = cfg.MODEL.RESNETS.NORM
        assert not cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE[-1], \
            'Deformable conv is not yet supported in res5 head.'
        # fmt: on

        # Three bottleneck blocks; the first downsamples spatially by 2.
        blocks = make_stage(
            BottleneckBlock,
            3,
            first_stride=2,
            in_channels=out_channels // 2,
            bottleneck_channels=bottleneck_channels,
            out_channels=out_channels,
            num_groups=num_groups,
            norm=norm,
            stride_in_1x1=stride_in_1x1,
        )
        return nn.Sequential(*blocks), out_channels

    def _shared_roi_transform(self, features, boxes):
        """ROI-pool the boxes from the feature map(s), then run them through res5."""
        x = self.pooler(features, boxes)
        x = self.res5(x)
        return x

    def forward(self, images, features, proposals, targets=None):
        """
        See :class:`ROIHeads.forward`.
        """
        del images
        if self.training:
            # Match proposals to gt and subsample a fg/bg training batch.
            proposals = self.label_and_sample_proposals(proposals, targets)
        del targets

        proposal_boxes = [x.proposal_boxes for x in proposals]
        box_features = self._shared_roi_transform(
            [features[f] for f in self.in_features], proposal_boxes)
        feature_pooled = box_features.mean(dim=[2, 3])  # pooled to 1x1
        pred_class_logits, pred_proposal_deltas = self.box_predictor(
            feature_pooled)
        del feature_pooled

        outputs = FastRCNNOutputs(
            self.box2box_transform,
            pred_class_logits,
            pred_proposal_deltas,
            proposals,
            self.smooth_l1_beta,
        )
        if self.training:
            del features
            losses = outputs.losses()
            return [], losses
        else:
            pred_instances, _ = outputs.inference(
                self.test_score_thresh,
                self.test_nms_thresh,
                self.test_detections_per_img,
            )
            return pred_instances, {}

View File

@@ -0,0 +1,81 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import importlib
import sys
from collections import OrderedDict
from packaging import version
from modelscope.utils.import_utils import _torch_available
if sys.version_info < (3, 8):
import importlib_metadata
else:
import importlib.metadata as importlib_metadata
# Pinned detectron2 version; is_detectron2_version_available() compares
# major.minor against this.
DETECTRON2_REQUIRED_VERSION = version.parse('0.3')
def is_detectron2_version_available():
    """Return True if detectron2 is installed at the required 0.3.x version."""
    if importlib.util.find_spec('detectron2') is None:
        return False
    installed = version.parse(importlib_metadata.version('detectron2'))
    required = DETECTRON2_REQUIRED_VERSION
    # Only major.minor must match; patch releases are accepted.
    return (installed.major, installed.minor) == (required.major,
                                                  required.minor)
# Pinned torch version; is_torch_version_available() compares major.minor
# against this.
TORCH_REQUIRED_VERSION = version.parse('1.11')
def is_torch_version_available():
    """Return True if torch is installed at the required 1.11.x version."""
    if not _torch_available:
        return False
    installed = version.parse(importlib_metadata.version('torch'))
    # Only major.minor must match; patch releases are accepted.
    return (installed.major, installed.minor) == (TORCH_REQUIRED_VERSION.major,
                                                  TORCH_REQUIRED_VERSION.minor)
# Error-message templates; '{0}' is replaced with the requesting feature name.
DETECTRON2_IMPORT_ERROR = """
{0} requires the detectron2-0.3 but it was not found in your environment.
You can install it from modelscope lib with pip:
`pip install detectron2==0.3`
"""
TORCH_VERSION_IMPORT_ERROR = """
{0} requires the torch-1.11 but it was not found in your environment. You can install it with pip:
`pip install torch==1.11`
"""
# Maps a requirement tag to (availability_check, error_message_template).
# NOTE(review): 'MAAPING' looks like a typo for 'MAPPING'; the name is kept
# unchanged because requires_version() in this module refers to it.
REQUIREMENTS_MAAPING_VERSION = OrderedDict([
    ('detectron2-0.3', (is_detectron2_version_available,
                        DETECTRON2_IMPORT_ERROR)),
    ('torch-1.11', (is_torch_version_available, TORCH_VERSION_IMPORT_ERROR)),
])
# Requirement tags verified by requires_version().
REQUIREMENTS = ['detectron2-0.3', 'torch-1.11']
def requires_version():
    """Verify that every pinned DeFRCN dependency is installed.

    Raises:
        NotImplementedError: if a requirement tag has no registered checker.
        ImportError: aggregating the messages of every failed version check.
    """
    checks = []
    for req in REQUIREMENTS:
        if req in REQUIREMENTS_MAAPING_VERSION:
            check = REQUIREMENTS_MAAPING_VERSION[req]
        else:
            # Fixed the previously ungrammatical message
            # ('{} do not supported check').
            raise NotImplementedError(
                'version check is not supported for {}'.format(req))
        checks.append(check)
    # Render one message per unavailable requirement.
    failed = [
        msg.format('DeFRCN') for available, msg in checks if not available()
    ]
    if failed:
        raise ImportError(''.join(failed))

View File

@@ -0,0 +1,342 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/data/meta_voc.py
import os
import xml.etree.ElementTree as ET
import numpy as np
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
from fvcore.common.file_io import PathManager
# PASCAL VOC categories
# Few-shot VOC class splits. For each split id (1-3) the 20 classes are
# ordered with the 15 base classes first and the 5 novel classes last
# (matching PASCAL_VOC_BASE_CATEGORIES and PASCAL_VOC_NOVEL_CATEGORIES).
PASCAL_VOC_ALL_CATEGORIES = {
    1: [
        'aeroplane',
        'bicycle',
        'boat',
        'bottle',
        'car',
        'cat',
        'chair',
        'diningtable',
        'dog',
        'horse',
        'person',
        'pottedplant',
        'sheep',
        'train',
        'tvmonitor',
        'bird',
        'bus',
        'cow',
        'motorbike',
        'sofa',
    ],
    2: [
        'bicycle',
        'bird',
        'boat',
        'bus',
        'car',
        'cat',
        'chair',
        'diningtable',
        'dog',
        'motorbike',
        'person',
        'pottedplant',
        'sheep',
        'train',
        'tvmonitor',
        'aeroplane',
        'bottle',
        'cow',
        'horse',
        'sofa',
    ],
    3: [
        'aeroplane',
        'bicycle',
        'bird',
        'bottle',
        'bus',
        'car',
        'chair',
        'cow',
        'diningtable',
        'dog',
        'horse',
        'person',
        'pottedplant',
        'train',
        'tvmonitor',
        'boat',
        'cat',
        'motorbike',
        'sheep',
        'sofa',
    ]
}
# The 5 novel (few-shot) classes for each split id.
PASCAL_VOC_NOVEL_CATEGORIES = {
    1: ['bird', 'bus', 'cow', 'motorbike', 'sofa'],
    2: ['aeroplane', 'bottle', 'cow', 'horse', 'sofa'],
    3: ['boat', 'cat', 'motorbike', 'sheep', 'sofa']
}
# The 15 base classes (used for base training) for each split id.
PASCAL_VOC_BASE_CATEGORIES = {
    1: [
        'aeroplane',
        'bicycle',
        'boat',
        'bottle',
        'car',
        'cat',
        'chair',
        'diningtable',
        'dog',
        'horse',
        'person',
        'pottedplant',
        'sheep',
        'train',
        'tvmonitor',
    ],
    2: [
        'bicycle',
        'bird',
        'boat',
        'bus',
        'car',
        'cat',
        'chair',
        'diningtable',
        'dog',
        'motorbike',
        'person',
        'pottedplant',
        'sheep',
        'train',
        'tvmonitor',
    ],
    3: [
        'aeroplane',
        'bicycle',
        'bird',
        'bottle',
        'bus',
        'car',
        'chair',
        'cow',
        'diningtable',
        'dog',
        'horse',
        'person',
        'pottedplant',
        'train',
        'tvmonitor',
    ]
}
def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
                                classnames: list):
    """
    Load Pascal VOC detection annotations to Detectron2 format.

    Args:
        name: registered dataset name. If it contains 'shot', the few-shot
            split files under ``<root>/vocsplit`` are read instead of the
            standard image sets.
        root: dataset root directory.
        dirname: Contain "Annotations", "ImageSets", "JPEGImages"
        split (str): one of "train", "test", "val", "trainval"
        classnames: ordered list of category names; a class's index in this
            list becomes its ``category_id``.

    Returns:
        list[dict]: dataset dicts in the standard Detectron2 format.
    """
    is_shots = 'shot' in name
    dicts = []
    if is_shots:
        fileids = {}
        split_dir = os.path.join(root, 'vocsplit')
        # e.g. 'voc_2007_trainval_novel1_3shot_seed5' -> shot '3', seed 5
        shot = name.split('_')[-2].split('shot')[0]
        seed = int(name.split('_seed')[-1])
        split_dir = os.path.join(split_dir, 'seed{}'.format(seed))
        for cls in classnames:
            with PathManager.open(
                    os.path.join(split_dir,
                                 'box_{}shot_{}_train.txt'.format(shot,
                                                                  cls))) as f:
                # np.str was removed in NumPy 1.24; the builtin str is the
                # documented replacement and behaves identically here.
                fileids_ = np.loadtxt(f, dtype=str).tolist()
                if isinstance(fileids_, str):
                    fileids_ = [fileids_]
                fileids_ = [
                    fid.split('/')[-1].split('.jpg')[0] for fid in fileids_
                ]
                fileids[cls] = fileids_
        for cls, fileids_ in fileids.items():
            dicts_ = []
            for fileid in fileids_:
                # VOC2012 image ids contain an underscore (e.g. '2008_000008').
                year = '2012' if '_' in fileid else '2007'
                dir_voc = os.path.join(root, 'VOC{}'.format(year))
                anno_file = os.path.join(dir_voc, 'Annotations',
                                         fileid + '.xml')
                jpeg_file = os.path.join(dir_voc, 'JPEGImages',
                                         fileid + '.jpg')
                tree = ET.parse(anno_file)
                for obj in tree.findall('object'):
                    r = {
                        'file_name': jpeg_file,
                        'image_id': fileid,
                        'height': int(tree.findall('./size/height')[0].text),
                        'width': int(tree.findall('./size/width')[0].text),
                    }
                    cls_ = obj.find('name').text
                    if cls != cls_:
                        continue
                    bbox = obj.find('bndbox')
                    bbox = [
                        float(bbox.find(x).text)
                        for x in ['xmin', 'ymin', 'xmax', 'ymax']
                    ]
                    # VOC boxes are 1-indexed; shift to 0-indexed coordinates.
                    bbox[0] -= 1.0
                    bbox[1] -= 1.0
                    instances = [{
                        'category_id': classnames.index(cls),
                        'bbox': bbox,
                        'bbox_mode': BoxMode.XYXY_ABS,
                    }]
                    r['annotations'] = instances
                    dicts_.append(r)
            if len(dicts_) > int(shot):
                # Sample exactly `shot` annotated instances for this class.
                dicts_ = np.random.choice(dicts_, int(shot), replace=False)
            dicts.extend(dicts_)
    else:
        with PathManager.open(
                os.path.join(root, dirname, 'ImageSets', 'Main',
                             split + '.txt')) as f:
            # np.str was removed in NumPy 1.24; use the builtin str.
            fileids = np.loadtxt(f, dtype=str)
        for fileid in fileids:
            anno_file = os.path.join(root, dirname, 'Annotations',
                                     fileid + '.xml')
            jpeg_file = os.path.join(root, dirname, 'JPEGImages',
                                     fileid + '.jpg')
            tree = ET.parse(anno_file)
            r = {
                'file_name': jpeg_file,
                'image_id': fileid,
                'height': int(tree.findall('./size/height')[0].text),
                'width': int(tree.findall('./size/width')[0].text),
            }
            instances = []
            for obj in tree.findall('object'):
                cls = obj.find('name').text
                if not (cls in classnames):
                    continue
                bbox = obj.find('bndbox')
                bbox = [
                    float(bbox.find(x).text)
                    for x in ['xmin', 'ymin', 'xmax', 'ymax']
                ]
                # VOC boxes are 1-indexed; shift to 0-indexed coordinates.
                bbox[0] -= 1.0
                bbox[1] -= 1.0
                instances.append({
                    'category_id': classnames.index(cls),
                    'bbox': bbox,
                    'bbox_mode': BoxMode.XYXY_ABS,
                })
            r['annotations'] = instances
            dicts.append(r)
    return dicts
def register_meta_voc(name, root, dirname, split, year, keepclasses, sid):
    """Register one few-shot VOC split in the Detectron2 dataset catalogs.

    Args:
        name: dataset name to register.
        root: dataset root directory.
        dirname: VOC directory name, e.g. 'VOC2007'.
        split: image-set split, or few-shot split file prefix.
        year: 2007 or 2012.
        keepclasses: which class set to keep: starts with 'base_novel',
            'base' or 'novel'.
        sid: class-split id in {1, 2, 3}.

    Raises:
        ValueError: if ``keepclasses`` matches none of the known prefixes.
    """
    if keepclasses.startswith('base_novel'):
        thing_classes = PASCAL_VOC_ALL_CATEGORIES[sid]
    elif keepclasses.startswith('base'):
        thing_classes = PASCAL_VOC_BASE_CATEGORIES[sid]
    elif keepclasses.startswith('novel'):
        thing_classes = PASCAL_VOC_NOVEL_CATEGORIES[sid]
    else:
        # Previously an unknown value fell through and raised a confusing
        # NameError on `thing_classes` below; fail fast instead.
        raise ValueError('unknown keepclasses: {}'.format(keepclasses))
    DatasetCatalog.register(
        name,
        lambda: load_filtered_voc_instances(name, root, dirname, split,
                                            thing_classes),
    )
    MetadataCatalog.get(name).set(
        thing_classes=thing_classes,
        dirname=os.path.join(root, dirname),
        year=year,
        split=split,
        base_classes=PASCAL_VOC_BASE_CATEGORIES[sid],
        novel_classes=PASCAL_VOC_NOVEL_CATEGORIES[sid],
    )
def register_all_voc(root='datasets'):
    """Register every base/novel/all VOC split used by DeFRCN.

    Args:
        root: dataset root directory containing 'VOC2007', 'VOC2012' and
            'vocsplit'.
    """
    # Entries are (name, VOC dirname, image-set split, kept class set, sid).
    METASPLITS = [
        ('voc_2007_trainval_base1', 'VOC2007', 'trainval', 'base1', 1),
        ('voc_2007_trainval_base2', 'VOC2007', 'trainval', 'base2', 2),
        ('voc_2007_trainval_base3', 'VOC2007', 'trainval', 'base3', 3),
        ('voc_2012_trainval_base1', 'VOC2012', 'trainval', 'base1', 1),
        ('voc_2012_trainval_base2', 'VOC2012', 'trainval', 'base2', 2),
        ('voc_2012_trainval_base3', 'VOC2012', 'trainval', 'base3', 3),
        ('voc_2007_trainval_all1', 'VOC2007', 'trainval', 'base_novel_1', 1),
        ('voc_2007_trainval_all2', 'VOC2007', 'trainval', 'base_novel_2', 2),
        ('voc_2007_trainval_all3', 'VOC2007', 'trainval', 'base_novel_3', 3),
        ('voc_2012_trainval_all1', 'VOC2012', 'trainval', 'base_novel_1', 1),
        ('voc_2012_trainval_all2', 'VOC2012', 'trainval', 'base_novel_2', 2),
        ('voc_2012_trainval_all3', 'VOC2012', 'trainval', 'base_novel_3', 3),
        ('voc_2007_test_base1', 'VOC2007', 'test', 'base1', 1),
        ('voc_2007_test_base2', 'VOC2007', 'test', 'base2', 2),
        ('voc_2007_test_base3', 'VOC2007', 'test', 'base3', 3),
        ('voc_2007_test_novel1', 'VOC2007', 'test', 'novel1', 1),
        ('voc_2007_test_novel2', 'VOC2007', 'test', 'novel2', 2),
        ('voc_2007_test_novel3', 'VOC2007', 'test', 'novel3', 3),
        ('voc_2007_test_all1', 'VOC2007', 'test', 'base_novel_1', 1),
        ('voc_2007_test_all2', 'VOC2007', 'test', 'base_novel_2', 2),
        ('voc_2007_test_all3', 'VOC2007', 'test', 'base_novel_3', 3),
    ]
    # Add the few-shot fine-tuning splits: 'all'/'novel' prefixes, 3 class
    # splits, {1,2,3,5,10} shots, 2 years and 30 random seeds each.
    for prefix in ['all', 'novel']:
        for sid in range(1, 4):
            for shot in [1, 2, 3, 5, 10]:
                for year in [2007, 2012]:
                    for seed in range(30):
                        seed = '_seed{}'.format(seed)
                        name = 'voc_{}_trainval_{}{}_{}shot{}'.format(
                            year, prefix, sid, shot, seed)
                        dirname = 'VOC{}'.format(year)
                        img_file = '{}_{}shot_split_{}_trainval'.format(
                            prefix, shot, sid)
                        keepclasses = ('base_novel_{}'.format(sid) if prefix
                                       == 'all' else 'novel{}'.format(sid))
                        METASPLITS.append(
                            (name, dirname, img_file, keepclasses, sid))
    for name, dirname, split, keepclasses, sid in METASPLITS:
        # Skip names already registered in this process.
        if name in DatasetCatalog:
            continue
        year = 2007 if '2007' in name else 2012
        register_meta_voc(
            name,
            root,
            dirname,
            split,
            year,
            keepclasses,
            sid,
        )
        MetadataCatalog.get(name).evaluator_type = 'pascal_voc'

View File

@@ -82,6 +82,8 @@ TASK_INPUTS = {
InputType.IMAGE,
Tasks.portrait_matting:
InputType.IMAGE,
Tasks.image_fewshot_detection:
InputType.IMAGE,
# image editing task result for a single image
Tasks.skin_retouching:

View File

@@ -269,6 +269,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.image_multi_view_depth_estimation: (
Pipelines.image_multi_view_depth_estimation,
'damo/cv_casmvs_multi-view-depth-estimation_general'),
Tasks.image_fewshot_detection: (
Pipelines.image_fewshot_detection,
'damo/cv_resnet101_detection_fewshot-defrcn'),
Tasks.image_body_reshaping: (Pipelines.image_body_reshaping,
'damo/cv_flow-based-body-reshaping_damo'),
Tasks.image_face_fusion: (Pipelines.image_face_fusion,

View File

@@ -83,6 +83,7 @@ if TYPE_CHECKING:
from .image_mvs_depth_estimation_pipeline import ImageMultiViewDepthEstimationPipeline
from .panorama_depth_estimation_pipeline import PanoramaDepthEstimationPipeline
from .ddcolor_image_colorization_pipeline import DDColorImageColorizationPipeline
from .image_defrcn_fewshot_pipeline import ImageDefrcnDetectionPipeline
else:
_import_structure = {
@@ -197,6 +198,7 @@ else:
'ddcolor_image_colorization_pipeline': [
'DDColorImageColorizationPipeline'
],
'image_defrcn_fewshot_pipeline': ['ImageDefrcnDetectionPipeline'],
}
import sys

View File

@@ -0,0 +1,104 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict
import numpy as np
import torch
from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
@PIPELINES.register_module(
    Tasks.image_fewshot_detection,
    module_name=Pipelines.image_fewshot_detection)
class ImageDefrcnDetectionPipeline(Pipeline):
    """ Image DeFRCN few-shot detection Pipeline. Given a image,
    pipeline will return the detection results on the image.

    Example:

    ```python
    >>> from modelscope.pipelines import pipeline
    >>> detector = pipeline('image-fewshot-detection', 'damo/cv_resnet101_detection_fewshot-defrcn')
    >>> detector('/Path/Image')
    {
        'scores': [0.8307567834854126, 0.1606406420469284],
        'labels': ['person', 'dog'],
        'boxes': [
            [27.391937255859375, 0.0, 353.0, 500.0],
            [64.22428131103516, 229.2884521484375, 213.90573120117188, 370.0657958984375]
        ]
    }
    >>> #
    ```
    """

    def __init__(self, model: str, **kwargs):
        """
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, auto_collate=False, **kwargs)
        model_path = os.path.join(self.model.model_dir,
                                  ModelFile.TORCH_MODEL_FILE)
        # Load the fine-tuned detection weights into the wrapped model.
        self.model.model = self._load_pretrained(
            self.model.model, model_path, self.model.model_cfg.MODEL.DEVICE)

    def _load_pretrained(self, net, load_path, device='cuda', strict=True):
        """Load a checkpoint, dropping training-only state before loading.

        Args:
            net: torch module to receive the weights.
            load_path: checkpoint file path.
            device: map_location passed to ``torch.load``.
            strict: forwarded to ``load_state_dict``.

        Returns:
            The module with the loaded weights.
        """
        load_net = torch.load(load_path, map_location=device)
        # Strip optimizer/scheduler/iteration entries saved during training.
        if 'scheduler' in load_net:
            del load_net['scheduler']
        if 'optimizer' in load_net:
            del load_net['optimizer']
        if 'iteration' in load_net:
            del load_net['iteration']
        net.load_state_dict(load_net['model'], strict=strict)
        return net

    def preprocess(self, input: Input) -> Dict[str, Any]:
        """Convert the input image to a CHW float tensor in BGR order."""
        img = LoadImage.convert_to_ndarray(input)
        # Fix: np.float was removed in NumPy 1.24; np.float64 is the exact
        # equivalent of the old alias (builtin float).
        img = img.astype(np.float64)
        image = img[..., ::-1].copy()  # rgb to bgr
        tim = torch.Tensor(image).permute(2, 0, 1)
        result = {'image': tim}
        return result

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Run model inference on the preprocessed input."""
        outputs = self.model.inference(input)
        result = {'data': outputs}
        return result

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Convert raw instances to scores/labels/boxes output lists."""
        # No detections: return empty result lists.
        if inputs['data'] is None:
            outputs = {
                OutputKeys.SCORES: [],
                OutputKeys.LABELS: [],
                OutputKeys.BOXES: []
            }
            return outputs
        objects = inputs['data']['instances'].get_fields()
        labels, bboxes = [], []
        for label, box in zip(objects['pred_classes'], objects['pred_boxes']):
            # Map the predicted class index to its human-readable name.
            labels.append(self.model.config.model.classes[label])
            bboxes.append(box.tolist())
        scores = objects['scores'].tolist()
        outputs = {
            OutputKeys.SCORES: scores,
            OutputKeys.LABELS: labels,
            OutputKeys.BOXES: bboxes
        }
        return outputs

View File

@@ -10,6 +10,7 @@ if TYPE_CHECKING:
from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer
from .image_inpainting_trainer import ImageInpaintingTrainer
from .referring_video_object_segmentation_trainer import ReferringVideoObjectSegmentationTrainer
from .image_defrcn_fewshot_detection_trainer import ImageDefrcnFewshotTrainer
else:
_import_structure = {
@@ -20,7 +21,9 @@ else:
'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'],
'image_inpainting_trainer': ['ImageInpaintingTrainer'],
'referring_video_object_segmentation_trainer':
['ReferringVideoObjectSegmentationTrainer']
['ReferringVideoObjectSegmentationTrainer'],
'image_defrcn_fewshot_detection_trainer':
['ImageDefrcnFewshotTrainer']
}
import sys

View File

@@ -0,0 +1,316 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/engine/defaults.py
# https://github.com/er-muyue/DeFRCN/blob/main/tools/model_surgery.py
import os
from typing import Callable, Optional, Union
import torch
from detectron2.engine import SimpleTrainer, hooks
from detectron2.evaluation import DatasetEvaluators, verify_results
from detectron2.utils import comm
from torch import nn
from modelscope.metainfo import Trainers
from modelscope.models.base import Model, TorchModel
from modelscope.trainers.base import BaseTrainer
from modelscope.trainers.builder import TRAINERS
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.logger import get_logger
class DefaultTrainer(SimpleTrainer):
    """Detectron2-style trainer wrapping ``SimpleTrainer`` with the standard
    optimizer / scheduler / checkpointer / hook setup used by DeFRCN.
    """

    def __init__(self, model, cfg):
        """
        Args:
            model: the detection model to train.
            cfg: detectron2 config node.
        """
        # Imports are local so that detectron2 is only required at train time.
        # (Removed the unused build_detection_test_loader import.)
        from torch.nn.parallel import DistributedDataParallel
        from detectron2.data.build import build_detection_train_loader
        from detectron2.solver.build import build_optimizer, build_lr_scheduler
        from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer
        from detectron2.utils.logger import setup_logger
        setup_logger()
        optimizer = build_optimizer(cfg, model)
        data_loader = build_detection_train_loader(cfg)
        if comm.get_world_size() > 1:
            model = DistributedDataParallel(
                model,
                device_ids=[comm.get_local_rank()],
                broadcast_buffers=False,
                find_unused_parameters=True)
        super().__init__(model, data_loader, optimizer)
        self.scheduler = build_lr_scheduler(cfg, optimizer)
        # Checkpointer saves/loads the model together with optimizer state.
        self.checkpointer = DetectionCheckpointer(
            model,
            cfg.OUTPUT_DIR,
            optimizer=optimizer,
            scheduler=self.scheduler,
        )
        self.start_iter = 0
        self.max_iter = cfg.SOLVER.MAX_ITER
        self.cfg = cfg
        self.register_hooks(self.build_hooks())

    def resume_or_load(self, resume=True):
        """Load cfg.MODEL.WEIGHTS (or resume the last checkpoint) and set
        ``start_iter`` accordingly."""
        # The checkpoint stores the training iteration that just finished, thus we start
        # at the next iteration (or iter zero if there's no checkpoint).
        self.start_iter = (
            self.checkpointer.resume_or_load(
                self.cfg.MODEL.WEIGHTS, resume=resume).get('iteration', -1)
            + 1)

    def build_hooks(self):
        """
        Build a list of default hooks, including timing, evaluation,
        checkpointing, lr scheduling, precise BN, writing events.

        Returns:
            list[HookBase]:
        """
        # Fix: these names were previously imported only inside __init__ and
        # therefore unbound here, raising NameError as soon as
        # cfg.TEST.PRECISE_BN.ENABLED was true.
        from fvcore.nn.precise_bn import get_bn_modules
        from detectron2.data.build import build_detection_train_loader
        cfg = self.cfg.clone()
        cfg.defrost()
        # Avoid spawning worker processes for the hook's auxiliary loader.
        cfg.DATALOADER.NUM_WORKERS = 0
        ret = [
            hooks.IterationTimer(),
            hooks.LRScheduler(self.optimizer, self.scheduler),
            # None entries are filtered out by register_hooks().
            hooks.PreciseBN(
                cfg.TEST.EVAL_PERIOD,
                self.model,
                build_detection_train_loader(cfg),
                cfg.TEST.PRECISE_BN.NUM_ITER,
            ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
            else None,
        ]
        if comm.is_main_process():
            ret.append(
                hooks.PeriodicCheckpointer(self.checkpointer,
                                           cfg.SOLVER.CHECKPOINT_PERIOD))

        def test_and_save_results():
            self._last_eval_results = self.test(self.cfg, self.model)
            return self._last_eval_results

        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
        if comm.is_main_process():
            ret.append(hooks.PeriodicWriter(self.build_writers(), period=20))
        return ret

    def build_writers(self):
        """Return the default metric writers (console, JSON, TensorBoard)."""
        from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
        return [
            CommonMetricPrinter(self.max_iter),
            JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, 'metrics.json')),
            TensorboardXWriter(self.cfg.OUTPUT_DIR),
        ]

    def train(self):
        """
        Run training.

        Returns:
            OrderedDict of results, if evaluation is enabled. Otherwise None.
        """
        super().train(self.start_iter, self.max_iter)
        if hasattr(self, '_last_eval_results') and comm.is_main_process():
            verify_results(self.cfg, self._last_eval_results)
            return self._last_eval_results

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Create the evaluator matching the dataset's evaluator_type.

        Raises:
            NotImplementedError: if no evaluator exists for the dataset type.
        """
        from detectron2.data import MetadataCatalog
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, 'inference')
        evaluator_list = []
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
        if evaluator_type == 'coco':
            from detectron2.evaluation import COCOEvaluator
            evaluator_list.append(
                COCOEvaluator(dataset_name, True, output_folder))
        if evaluator_type == 'pascal_voc':
            from detectron2.evaluation import PascalVOCDetectionEvaluator
            return PascalVOCDetectionEvaluator(dataset_name)
        if len(evaluator_list) == 0:
            raise NotImplementedError(
                'no Evaluator for the dataset {} with the type {}'.format(
                    dataset_name, evaluator_type))
        if len(evaluator_list) == 1:
            return evaluator_list[0]
        return DatasetEvaluators(evaluator_list)

    @classmethod
    def test(cls, cfg, model, evaluators=None):
        """Evaluate ``model`` using detectron2's stock test loop, but with
        this class's ``build_evaluator``."""
        from detectron2.engine.defaults import DefaultTrainer as _DefaultTrainer
        _DefaultTrainer.build_evaluator = cls.build_evaluator
        return _DefaultTrainer.test(cfg, model, evaluators)
@TRAINERS.register_module(module_name=Trainers.image_fewshot_detection)
class ImageDefrcnFewshotTrainer(BaseTrainer):
    """Trainer for DeFRCN few-shot object detection."""

    def __init__(self,
                 model: Optional[Union[TorchModel, nn.Module, str]] = None,
                 cfg_file: Optional[str] = None,
                 arg_parse_fn: Optional[Callable] = None,
                 model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
                 seed: int = 0,
                 cfg_modify_fn: Optional[Callable] = None,
                 **kwargs):
        """
        Args:
            model: model id on the hub, or an already constructed model.
            cfg_file: configuration file path; required when ``model`` is not
                a hub model id.
            arg_parse_fn: same as in :class:`BaseTrainer`.
            model_revision: hub revision used when downloading the model.
            seed: kept for interface compatibility; not used in this method.
            cfg_modify_fn: optional hook to modify the loaded config.
            kwargs: supports 'datasets_train', 'datasets_test', 'work_dir',
                'data_dir' and 'data_type'.
        """
        if isinstance(model, str):
            self.model_dir = self.get_or_download_model_dir(
                model, model_revision)
            if cfg_file is None:
                cfg_file = os.path.join(self.model_dir,
                                        ModelFile.CONFIGURATION)
        else:
            assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!'
            self.model_dir = os.path.dirname(cfg_file)
        super().__init__(cfg_file, arg_parse_fn)
        if cfg_modify_fn is not None:
            self.cfg = cfg_modify_fn(self.cfg)
        self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO'))
        if isinstance(model, (TorchModel, nn.Module)):
            self.model = model
        else:
            self.model = self.build_model(**kwargs)
        # Detectron2-style config exposed by the model wrapper.
        self.model_cfg = self.model.get_model_cfg()
        if 'datasets_train' in kwargs:
            self.model_cfg.merge_from_list(
                ['DATASETS.TRAIN', kwargs['datasets_train']])
        if 'datasets_test' in kwargs:
            self.model_cfg.merge_from_list(
                ['DATASETS.TEST', kwargs['datasets_test']])
        if 'work_dir' in kwargs:
            self.model_cfg.merge_from_list(['OUTPUT_DIR', kwargs['work_dir']])
        if not os.path.exists(self.model_cfg.OUTPUT_DIR):
            os.makedirs(self.model_cfg.OUTPUT_DIR)
        self.model_cfg.freeze()
        self.data_dir = kwargs.get('data_dir', None)
        self.data_type = kwargs.get('data_type', 'pascal_voc')
        self.register_data(self.data_type, self.data_dir)
        self.trainer = DefaultTrainer(self.model, self.model_cfg)

    def train(self, *args, **kwargs):
        """Resume from the configured weights if present, then run training."""
        self.trainer.resume_or_load()
        self.trainer.train()

    def evaluate(self, checkpoint_path: str, *args, **kwargs):
        """Evaluate a checkpoint on the configured test datasets.

        Args:
            checkpoint_path: path of the checkpoint to evaluate.

        Returns:
            The metric values produced by :meth:`DefaultTrainer.test`.
        """
        from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer
        DetectionCheckpointer(
            self.model,
            save_dir=self.model_cfg.OUTPUT_DIR).resume_or_load(checkpoint_path)
        metric_values = DefaultTrainer.test(self.model_cfg, self.model)
        return metric_values

    def build_model(self, *args, **kwargs) -> Union[nn.Module, TorchModel]:
        """Build the model from the downloaded model directory.

        Returns the inner ``.model`` attribute when the hub wrapper exposes
        one, otherwise the wrapper itself when it is an ``nn.Module``.
        """
        model = Model.from_pretrained(self.model_dir, **kwargs)
        if not isinstance(model, nn.Module) and hasattr(model, 'model'):
            return model.model
        elif isinstance(model, nn.Module):
            return model

    @classmethod
    def register_data(cls, data_type='pascal_voc', data_dir=None):
        """Register the train/eval datasets in the detectron2 catalogs.

        Raises:
            NotImplementedError: for unsupported ``data_type`` values.
        """
        if data_type == 'pascal_voc':
            from modelscope.models.cv.image_defrcn_fewshot.utils.voc_register import register_all_voc
            if data_dir:
                register_all_voc(data_dir)
            else:
                register_all_voc()
        else:
            raise NotImplementedError(
                'no {} dataset was registered'.format(data_type))

    @classmethod
    def model_surgery(cls,
                      src_path,
                      save_dir,
                      data_type='pascal_voc',
                      method='remove'):
        """Prepare base-training weights for few-shot fine-tuning.

        Args:
            src_path: path of the source checkpoint.
            save_dir: directory the surgered checkpoint is written to.
            data_type: dataset type; only 'pascal_voc' is supported.
            method: 'remove' drops the classifier/regressor head weights;
                'randinit' re-initializes them for the enlarged class set.

        Raises:
            NotImplementedError: for unsupported ``data_type`` values.
        """
        assert method in ['remove',
                          'randinit'], '{} not implemented'.format(method)

        def _surgery(param_name, is_weight, tar_size, ckpt):
            # Grow one head parameter to `tar_size` rows, keeping the
            # pretrained rows and re-initializing the new ones.
            weight_name = param_name + ('.weight' if is_weight else '.bias')
            pretrained_weight = ckpt['model'][weight_name]
            prev_cls = pretrained_weight.size(0)
            if 'cls_score' in param_name:
                prev_cls -= 1  # last row is the background class
            if is_weight:
                feat_size = pretrained_weight.size(1)
                new_weight = torch.rand((tar_size, feat_size))
                torch.nn.init.normal_(new_weight, 0, 0.01)
            else:
                new_weight = torch.zeros(tar_size)
            new_weight[:prev_cls] = pretrained_weight[:prev_cls]
            if 'cls_score' in param_name:
                new_weight[-1] = pretrained_weight[-1]  # bg class
            ckpt['model'][weight_name] = new_weight

        if data_type == 'pascal_voc':
            TAR_SIZE = 20  # number of VOC classes
            params_name = [
                'model.roi_heads.box_predictor.cls_score',
                'model.roi_heads.box_predictor.bbox_pred'
            ]
            save_name = 'model_reset_' + ('remove' if method == 'remove' else
                                          'surgery') + '.pth'
            save_path = os.path.join(save_dir, save_name)
            os.makedirs(save_dir, exist_ok=True)
            ckpt = torch.load(src_path)
            # Drop training state; reset the iteration counter.
            if 'scheduler' in ckpt:
                del ckpt['scheduler']
            if 'optimizer' in ckpt:
                del ckpt['optimizer']
            if 'iteration' in ckpt:
                ckpt['iteration'] = 0
            if method == 'remove':
                for param_name in params_name:
                    del ckpt['model'][param_name + '.weight']
                    if param_name + '.bias' in ckpt['model']:
                        del ckpt['model'][param_name + '.bias']
            else:
                # +1 for cls_score's background class; 4 box deltas per class
                # for bbox_pred. (Dropped the unused enumerate index.)
                tar_sizes = [TAR_SIZE + 1, TAR_SIZE * 4]
                for param_name, tar_size in zip(params_name, tar_sizes):
                    _surgery(param_name, True, tar_size, ckpt)
                    _surgery(param_name, False, tar_size, ckpt)
            torch.save(ckpt, save_path)
        else:
            # Fix: the exception was previously constructed but never raised,
            # so unsupported dataset types silently did nothing.
            raise NotImplementedError(
                '{} dataset is not supported'.format(data_type))

View File

@@ -46,6 +46,7 @@ class CVTasks(object):
image_object_detection = 'image-object-detection'
video_object_detection = 'video-object-detection'
image_fewshot_detection = 'image-fewshot-detection'
image_segmentation = 'image-segmentation'
semantic_segmentation = 'semantic-segmentation'

View File

@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import subprocess
import sys
import unittest
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import test_level
logger = get_logger()
class ImageDefrcnFewShotTest(unittest.TestCase, DemoCompatibilityCheck):
    """Pipeline tests for the DeFRCN few-shot detection model."""

    def setUp(self) -> None:
        # The model requires a pinned detectron2 build; install it up front.
        logger.info('start install detectron2-0.3')
        pip_cmd = [
            sys.executable, '-m', 'pip', 'install', 'detectron2==0.3', '-f',
            'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html'
        ]
        subprocess.run(pip_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        logger.info('install detectron2-0.3 finished')
        self.task = Tasks.image_fewshot_detection
        self.model_id = 'damo/cv_resnet101_detection_fewshot-defrcn'
        self.image = 'data/test/images/image_voc2007_000001.jpg'

    def _detect_and_print(self, detector) -> None:
        # Run the detector on the fixture image and print predicted labels.
        print(detector(input=self.image)[OutputKeys.LABELS])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        self._detect_and_print(pipeline(task=self.task, model=model))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_model_name(self):
        self._detect_and_print(pipeline(task=self.task, model=self.model_id))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_with_default_model(self):
        self._detect_and_print(pipeline(task=self.task))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_by_direct_model_download(self):
        cache_path = snapshot_download(self.model_id)
        self._detect_and_print(pipeline(self.task, model=cache_path))

    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
    def test_demo_compatibility(self):
        self.compatibility_check()
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -49,6 +49,7 @@ isolated: # test cases that may require excessive anmount of GPU memory or run
- test_kws_nearfield_trainer.py
- test_gpt3_text_generation.py
- test_ddcolor_image_colorization.py
- test_image_defrcn_fewshot_trainer.py
- test_image_deblur_trainer.py
envs:

View File

@@ -0,0 +1,70 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import subprocess
import sys
import tempfile
import unittest
from modelscope.hub.utils.utils import get_cache_dir
from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.constant import DownloadMode
from modelscope.utils.test_utils import test_level
class TestImageDefrcnFewShotTrainer(unittest.TestCase):
    """Training test for the DeFRCN few-shot detection trainer."""

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        # The trainer requires a pinned detectron2 build; install it up front.
        cmd = [
            sys.executable, '-m', 'pip', 'install', 'detectron2==0.3', '-f',
            'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html'
        ]
        subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Fix: tempfile.TemporaryDirectory().name dropped the only reference
        # to the TemporaryDirectory object, whose finalizer may delete the
        # directory at an arbitrary GC point during the test. mkdtemp()
        # creates the directory with no finalizer; tearDown removes it.
        self.tmp_dir = tempfile.mkdtemp()
        self.model_id = 'damo/cv_resnet101_detection_fewshot-defrcn'
        # Download the few-shot VOC dataset and locate its 'data' directory.
        data_voc = MsDataset.load(
            dataset_name='VOC_fewshot',
            namespace='shimin2023',
            split='train',
            download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)
        self.data_dir = os.path.join(
            data_voc.config_kwargs['split_config']['train'], 'data')

    def tearDown(self):
        shutil.rmtree(self.tmp_dir)
        super().tearDown()

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_trainer(self):
        split = 1
        kwargs = dict(
            model=self.model_id,
            data_dir=self.data_dir,
            work_dir=self.tmp_dir,
            model_weights=os.path.join(get_cache_dir(), self.model_id,
                                       'ImageNetPretrained/MSRA/R-101.pkl'),
            data_type='pascal_voc',
            config_path='defrcn_det_r101_base{}.yaml'.format(split),
            datasets_train=('voc_2007_trainval_base{}'.format(split),
                            'voc_2012_trainval_base{}'.format(split)),
            datasets_test=('voc_2007_test_base{}'.format(split), ))
        trainer = build_trainer(
            name=Trainers.image_fewshot_detection, default_args=kwargs)
        trainer.train()
        # Training must leave the metrics log and the final checkpoint behind.
        results_files = os.listdir(self.tmp_dir)
        self.assertIn('metrics.json', results_files)
        self.assertIn('model_final.pth', results_files)
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()