add cv/image-defrcn-fewshot-detection

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11364804

* add model defrcn-fewshot-detection

* add requirements check
This commit is contained in:
shimin.ysm
2023-01-12 12:48:38 +00:00
committed by wenmeng.zwm
parent 8cd79a4fea
commit f7930c23a0
24 changed files with 1965 additions and 16 deletions

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f0bdad67d01aa452929683b74a124a2926b6bce534c85f3ee0f00e20eeacab0
size 78771

View File

@@ -76,6 +76,7 @@ class Models(object):
image_casmvs_depth_estimation = 'image-casmvs-depth-estimation'
vop_retrieval_model = 'vop-retrieval-model'
ddcolor = 'ddcolor'
defrcn = 'defrcn'
image_face_fusion = 'image-face-fusion'
# EasyCV models
@@ -296,6 +297,7 @@ class Pipelines(object):
image_multi_view_depth_estimation = 'image-multi-view-depth-estimation'
vop_retrieval = 'vop-video-text-retrieval'
ddcolor_image_colorization = 'ddcolor-image-colorization'
image_fewshot_detection = 'image-fewshot-detection'
image_face_fusion = 'image-face-fusion'
# nlp tasks
@@ -416,6 +418,7 @@ class Trainers(object):
referring_video_object_segmentation = 'referring-video-object-segmentation'
image_classification_team = 'image-classification-team'
image_classification = 'image-classification'
image_fewshot_detection = 'image-fewshot-detection'
# nlp trainers
bert_sentiment_analysis = 'bert-sentiment-analysis'

View File

@@ -5,20 +5,20 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints,
body_3d_keypoints, cartoon, cmdssl_video_embedding,
crowd_counting, face_2d_keypoints, face_detection,
face_generation, human_wholebody_keypoint, image_classification,
image_color_enhance, image_colorization, image_denoise,
image_inpainting, image_instance_segmentation, image_matching,
image_mvs_depth_estimation, image_panoptic_segmentation,
image_portrait_enhancement, image_reid_person,
image_semantic_segmentation, image_to_image_generation,
image_to_image_translation, language_guided_video_summarization,
movie_scene_segmentation, object_detection,
panorama_depth_estimation, pointcloud_sceneflow_estimation,
product_retrieval_embedding, realtime_object_detection,
referring_video_object_segmentation, salient_detection,
shop_segmentation, super_resolution, video_frame_interpolation,
video_object_segmentation, video_single_object_tracking,
video_stabilization, video_summarization,
video_super_resolution, virual_tryon, vision_middleware,
vop_retrieval)
image_color_enhance, image_colorization, image_defrcn_fewshot,
image_denoise, image_inpainting, image_instance_segmentation,
image_matching, image_mvs_depth_estimation,
image_panoptic_segmentation, image_portrait_enhancement,
image_reid_person, image_semantic_segmentation,
image_to_image_generation, image_to_image_translation,
language_guided_video_summarization, movie_scene_segmentation,
object_detection, panorama_depth_estimation,
pointcloud_sceneflow_estimation, product_retrieval_embedding,
realtime_object_detection, referring_video_object_segmentation,
salient_detection, shop_segmentation, super_resolution,
video_frame_interpolation, video_object_segmentation,
video_single_object_tracking, video_stabilization,
video_summarization, video_super_resolution, virual_tryon,
vision_middleware, vop_retrieval)
# yapf: enable

View File

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    # Real import only for static type checkers / IDEs; at runtime the
    # submodule is loaded lazily to avoid importing heavy dependencies
    # (torch, detectron2) until the model is actually used.
    from .defrcn_for_fewshot import DeFRCNForFewShot
else:
    # Map: submodule name -> list of public symbols it provides.
    _import_structure = {'defrcn_for_fewshot': ['DeFRCNForFewShot']}

    import sys

    # Replace this package module with a lazy proxy that imports the
    # submodules above on first attribute access.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

View File

@@ -0,0 +1,80 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict
import torch
from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .models.defaults_config import _C
from .models.defrcn import DeFRCN
from .utils.requirements_check import requires_version
logger = get_logger()
__all__ = ['DeFRCNForFewShot']
@MODELS.register_module(
    Tasks.image_fewshot_detection, module_name=Models.defrcn)
class DeFRCNForFewShot(TorchModel):
    """Few-shot object detection model DeFRCN.

    Requires detectron2-0.3 and pytorch-1.11 (verified at construction
    time). Model configuration follows detectron2's config system; the
    available keys are documented in ``detectron2.config.defaults`` and
    ``.models.defaults_config``.
    """

    def __init__(self, model_dir: str, *args, **kwargs):
        """Build the DeFRCN model from a local model directory.

        Args:
            model_dir (str): directory holding the modelscope configuration
                file and the detectron2-style model config.
            **kwargs: optional overrides:
                config_path: relative path of the detectron2 config file.
                model_weights: checkpoint path written to MODEL.WEIGHTS.
        """
        # Fail fast on incompatible detectron2 / torch before any heavy init.
        requires_version()
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir

        # Load the modelscope-level configuration.json and apply overrides.
        self.config = Config.from_file(
            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
        if 'config_path' in kwargs:
            self.config.merge_from_dict(
                {'model.config_path': kwargs['config_path']})

        # Build the detectron2 config: defaults -> file -> kwargs overrides.
        cfg = _C.clone()
        cfg.merge_from_file(
            os.path.join(model_dir, self.config.model.config_path))
        if 'model_weights' in kwargs:
            cfg.merge_from_list(['MODEL.WEIGHTS', kwargs['model_weights']])
        cfg.freeze()
        self.model_cfg = cfg
        self.model = DeFRCN(self.model_cfg)

    def forward(self, inputs) -> Any:
        """Dispatch to training forward or inference based on module mode.

        Args:
            inputs (list): the preprocessed batched inputs.

        Returns:
            Any: a loss dict in training mode, detection results otherwise.
        """
        if not self.training:
            return self.model.inference(inputs)
        return self.model.forward(inputs)

    def inference(self, input: Dict[str, Any]) -> Any:
        """Run no-grad inference on one preprocessed sample; returns the
        single result or None when the model produces nothing."""
        with torch.no_grad():
            outputs = self.model([input])
        return outputs[0] if len(outputs) > 0 else None

    def get_model_cfg(self):
        """Return the frozen detectron2 config the model was built with."""
        return self.model_cfg

View File

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    # Import for static analysis only; runtime loading is deferred.
    from .defrcn import DeFRCN
else:
    # Map: submodule name -> list of public symbols it provides.
    _import_structure = {'defrcn': ['DeFRCN']}

    import sys

    # Install a lazy proxy so detectron2-dependent code is only imported
    # when DeFRCN is first accessed.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

View File

@@ -0,0 +1,38 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/config/defaults.py
from detectron2.config.defaults import _C

# NOTE: _CC is an alias for detectron2's global default config; the
# DeFRCN-specific keys below are added to _C in place as an import side
# effect, and consumers import `_C` from this module.
_CC = _C

# ----------- Backbone ----------- #
# FREEZE disables gradients for the whole backbone (see DeFRCN.__init__).
_CC.MODEL.BACKBONE.FREEZE = False
_CC.MODEL.BACKBONE.FREEZE_AT = 3

# ------------- RPN -------------- #
# FREEZE disables gradients for the proposal generator; ENABLE_DECOUPLE /
# BACKWARD_SCALE control the gradient-decoupled layer on the RPN branch.
_CC.MODEL.RPN.FREEZE = False
_CC.MODEL.RPN.ENABLE_DECOUPLE = False
_CC.MODEL.RPN.BACKWARD_SCALE = 1.0

# ------------- ROI -------------- #
# FREEZE_FEAT freezes the shared res5 feature block; ENABLE_DECOUPLE /
# BACKWARD_SCALE control gradient decoupling on the RCNN branch.
_CC.MODEL.ROI_HEADS.NAME = 'Res5ROIHeads'
_CC.MODEL.ROI_HEADS.FREEZE_FEAT = False
_CC.MODEL.ROI_HEADS.ENABLE_DECOUPLE = False
_CC.MODEL.ROI_HEADS.BACKWARD_SCALE = 1.0
_CC.MODEL.ROI_HEADS.OUTPUT_LAYER = 'FastRCNNOutputLayers'
# Optional dropout before the classification score layer (see
# FastRCNNOutputLayers.forward).
_CC.MODEL.ROI_HEADS.CLS_DROPOUT = False
_CC.MODEL.ROI_HEADS.DROPOUT_RATIO = 0.8
_CC.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 7  # for faster

# ------------- TEST ------------- #
# PCB = Prototypical Calibration Block settings used at test time.
_CC.TEST.PCB_ENABLE = False
_CC.TEST.PCB_MODELTYPE = 'resnet'  # res-like
_CC.TEST.PCB_MODELPATH = ''
_CC.TEST.PCB_ALPHA = 0.50
_CC.TEST.PCB_UPPER = 1.0
_CC.TEST.PCB_LOWER = 0.05

# ------------ Other ------------- #
_CC.SOLVER.WEIGHT_DECAY = 5e-5
_CC.MUTE_HEADER = True

View File

@@ -0,0 +1,179 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/rcnn.py
import os
from typing import Dict
import torch
from detectron2.layers import ShapeSpec
from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
from detectron2.modeling.backbone.resnet import build_resnet_backbone
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.proposal_generator.rpn import RPN, StandardRPNHead
from detectron2.structures import ImageList
from torch import nn
from .gdl import AffineLayer, decouple_layer
from .roi_heads import Res5ROIHeads
class DeFRCN(nn.Module):
    """DeFRCN meta-architecture for few-shot detection.

    A Faster R-CNN style detector (ResNet backbone + RPN + C4 ROI heads)
    with Gradient Decoupled Layers (see .gdl) inserted between the shared
    backbone features and each of the RPN / RCNN branches: the forward pass
    applies a learned channel-wise affine transform, while the backward
    gradient of each branch is scaled by its configured BACKWARD_SCALE.
    """

    def __init__(self, cfg):
        """Build backbone, RPN, ROI heads and the GDL affine layers from ``cfg``."""
        super().__init__()
        self.cfg = cfg
        self.device = torch.device(cfg.MODEL.DEVICE)
        # Input channel count is inferred from the per-channel pixel mean.
        self.backbone = build_resnet_backbone(
            cfg, ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        self._SHAPE_ = self.backbone.output_shape()
        rpn_config = DeFRCN.from_rpn_config(cfg, self._SHAPE_)
        self.proposal_generator = RPN(**rpn_config)
        self.roi_heads = Res5ROIHeads(cfg, self._SHAPE_)
        self.normalizer = self.normalize_fn()
        # One affine layer per decoupled branch, applied on the 'res4'
        # feature map after gradient decoupling.
        self.affine_rpn = AffineLayer(
            num_channels=self._SHAPE_['res4'].channels, bias=True)
        self.affine_rcnn = AffineLayer(
            num_channels=self._SHAPE_['res4'].channels, bias=True)
        self.to(self.device)

        # Optionally freeze sub-networks (used during few-shot fine-tuning).
        if cfg.MODEL.BACKBONE.FREEZE:
            for p in self.backbone.parameters():
                p.requires_grad = False
        if cfg.MODEL.RPN.FREEZE:
            for p in self.proposal_generator.parameters():
                p.requires_grad = False
        if cfg.MODEL.ROI_HEADS.FREEZE_FEAT:
            for p in self.roi_heads.res5.parameters():
                p.requires_grad = False

    def forward(self, batched_inputs):
        """Training forward pass returning the merged RPN + RCNN loss dict.

        Delegates to :meth:`inference` when the module is in eval mode.
        """
        if not self.training:
            return self.inference(batched_inputs)
        assert 'instances' in batched_inputs[0]
        gt_instances = [x['instances'].to(self.device) for x in batched_inputs]

        proposal_losses, detector_losses, _, _ = self._forward_once_(
            batched_inputs, gt_instances)
        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses

    def inference(self, batched_inputs):
        """Inference pass.

        Returns:
            list[dict]: one dict per image with an 'instances' field, with
            boxes rescaled to each input's original 'height'/'width' when
            those keys are provided (falls back to the padded image size).
        """
        assert not self.training
        _, _, results, image_sizes = self._forward_once_(batched_inputs, None)
        processed_results = []
        for r, input, image_size in zip(results, batched_inputs, image_sizes):
            height = input.get('height', image_size[0])
            width = input.get('width', image_size[1])
            r = detector_postprocess(r, height, width)
            processed_results.append({'instances': r})
        return processed_results

    def _forward_once_(self, batched_inputs, gt_instances=None):
        """Shared forward used by both training and inference.

        Returns:
            (proposal_losses, detector_losses, results, image_sizes)
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        # RPN branch: optionally decouple gradients from the backbone
        # (identity forward with scaled backward) and apply the affine layer.
        features_de_rpn = features
        if self.cfg.MODEL.RPN.ENABLE_DECOUPLE:
            scale = self.cfg.MODEL.RPN.BACKWARD_SCALE
            features_de_rpn = {
                k: self.affine_rpn(decouple_layer(features[k], scale))
                for k in features
            }
        proposals, proposal_losses = self.proposal_generator(
            images, features_de_rpn, gt_instances)

        # RCNN branch: same decoupling with its own affine layer and scale.
        features_de_rcnn = features
        if self.cfg.MODEL.ROI_HEADS.ENABLE_DECOUPLE:
            scale = self.cfg.MODEL.ROI_HEADS.BACKWARD_SCALE
            features_de_rcnn = {
                k: self.affine_rcnn(decouple_layer(features[k], scale))
                for k in features
            }
        results, detector_losses = self.roi_heads(images, features_de_rcnn,
                                                  proposals, gt_instances)

        return proposal_losses, detector_losses, results, images.image_sizes

    def preprocess_image(self, batched_inputs):
        """Normalize each input image and batch them into a padded ImageList."""
        images = [x['image'].to(self.device) for x in batched_inputs]
        images = [self.normalizer(x) for x in images]
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)
        return images

    def normalize_fn(self):
        """Return a closure normalizing a CHW image tensor with the
        configured per-channel pixel mean and std."""
        assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD)
        num_channels = len(self.cfg.MODEL.PIXEL_MEAN)
        pixel_mean = (
            torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).to(self.device).view(
                num_channels, 1, 1))
        pixel_std = (
            torch.Tensor(self.cfg.MODEL.PIXEL_STD).to(self.device).view(
                num_channels, 1, 1))
        return lambda x: (x - pixel_mean) / pixel_std

    @classmethod
    def from_rpn_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Translate a detectron2 config into the keyword arguments expected
        by detectron2's ``RPN`` constructor.

        Builds the anchor generator, anchor matcher and RPN head explicitly
        rather than going through detectron2's registry.
        """
        in_features = cfg.MODEL.RPN.IN_FEATURES
        ret = {
            'in_features':
            in_features,
            'min_box_size':
            cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE,
            'nms_thresh':
            cfg.MODEL.RPN.NMS_THRESH,
            'batch_size_per_image':
            cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE,
            'positive_fraction':
            cfg.MODEL.RPN.POSITIVE_FRACTION,
            'loss_weight': {
                'loss_rpn_cls':
                cfg.MODEL.RPN.LOSS_WEIGHT,
                'loss_rpn_loc':
                cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT,
            },
            'anchor_boundary_thresh':
            cfg.MODEL.RPN.BOUNDARY_THRESH,
            'box2box_transform':
            Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS),
            'box_reg_loss_type':
            cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE,
            'smooth_l1_beta':
            cfg.MODEL.RPN.SMOOTH_L1_BETA,
        }
        # (train, test) pairs for pre/post-NMS proposal counts.
        ret['pre_nms_topk'] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN,
                               cfg.MODEL.RPN.PRE_NMS_TOPK_TEST)
        ret['post_nms_topk'] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN,
                                cfg.MODEL.RPN.POST_NMS_TOPK_TEST)
        # The registry call build_anchor_generator(cfg, ...) is bypassed; the
        # default anchor generator is constructed explicitly instead.
        anchor_cfg = DefaultAnchorGenerator.from_config(
            cfg, [input_shape[f] for f in in_features])
        ret['anchor_generator'] = DefaultAnchorGenerator(**anchor_cfg)
        ret['anchor_matcher'] = Matcher(
            cfg.MODEL.RPN.IOU_THRESHOLDS,
            cfg.MODEL.RPN.IOU_LABELS,
            allow_low_quality_matches=True)
        # Standard RPN head over the first (and typically only) input level.
        rpn_head_cfg = {
            'in_channels':
            [s.channels for s in [input_shape[f] for f in in_features]][0],
            'num_anchors':
            ret['anchor_generator'].num_anchors[0],
            'box_dim':
            ret['anchor_generator'].box_dim
        }
        ret['head'] = StandardRPNHead(**rpn_head_cfg)
        return ret

View File

@@ -0,0 +1,274 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/roi_heads/fast_rcnn.py
import numpy as np
import torch
from detectron2.layers import batched_nms, cat
from detectron2.modeling.roi_heads.fast_rcnn import \
fast_rcnn_inference_single_image
from detectron2.utils.events import get_event_storage
from fvcore.nn import smooth_l1_loss
from torch import nn
from torch.nn import functional as F
def fast_rcnn_inference(boxes, scores, image_shapes, score_thresh, nms_thresh,
                        topk_per_image):
    """Run per-image Fast R-CNN post-processing (thresholding + NMS + top-k).

    Args:
        boxes (list[Tensor]): predicted boxes, one tensor per image.
        scores (list[Tensor]): predicted class probabilities, one per image.
        image_shapes (list[tuple]): (height, width) of each image.
        score_thresh (float): minimum score to keep a detection.
        nms_thresh (float): IoU threshold for NMS.
        topk_per_image (int): keep at most this many detections per image.

    Returns:
        tuple(list, list): per-image detected Instances and per-image kept
        indices, as produced by ``fast_rcnn_inference_single_image``.
    """
    per_image_outputs = []
    for scores_per_image, boxes_per_image, image_shape in zip(
            scores, boxes, image_shapes):
        per_image_outputs.append(
            fast_rcnn_inference_single_image(
                boxes_per_image,
                scores_per_image,
                image_shape,
                score_thresh,
                nms_thresh,
                topk_per_image,
            ))
    return tuple(list(x) for x in zip(*per_image_outputs))
class FastRCNNOutputs(object):
    """
    A class that stores information about outputs of a Fast R-CNN head,
    and computes losses (training) or final detections (inference) from them.
    """

    def __init__(
        self,
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta,
    ):
        """
        Args:
            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
                box2box transform instance for proposal-to-detection transformations.
            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
                logits for all R predicted object instances.
                Each row corresponds to a predicted object instance.
            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
                class-specific or class-agnostic regression. It stores the predicted deltas that
                transform proposals into final box detections.
                B is the box dimension (4 or 5).
                When B is 4, each row is [dx, dy, dw, dh (, ....)].
                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
            proposals (list[Instances]): A list of N Instances, where Instances i stores the
                proposals for image i, in the field "proposal_boxes".
                When training, each Instances must have ground-truth labels
                stored in the field "gt_classes" and "gt_boxes".
            smooth_l1_beta (float): The transition point between L1 and L2 loss in
                the smooth L1 loss function. When set to 0, the loss becomes L1. When
                set to +inf, the loss becomes constant 0.
        """
        self.box2box_transform = box2box_transform
        # Per-image proposal counts, used later to split flat predictions.
        self.num_preds_per_image = [len(p) for p in proposals]
        self.pred_class_logits = pred_class_logits
        self.pred_proposal_deltas = pred_proposal_deltas
        self.smooth_l1_beta = smooth_l1_beta

        box_type = type(proposals[0].proposal_boxes)
        # cat(..., dim=0) concatenates over all images in the batch
        self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
        assert (not self.proposals.tensor.requires_grad
                ), 'Proposals should not require gradients!'
        self.image_shapes = [x.image_size for x in proposals]

        # The following fields should exist only when training.
        if proposals[0].has('gt_boxes'):
            self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
            assert proposals[0].has('gt_classes')
            self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)

    def _log_accuracy(self):
        """
        Log the accuracy metrics to EventStorage.
        """
        num_instances = self.gt_classes.numel()
        pred_classes = self.pred_class_logits.argmax(dim=1)
        # The last logit column is the implicit background class.
        bg_class_ind = self.pred_class_logits.shape[1] - 1

        fg_inds = (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind)
        num_fg = fg_inds.nonzero().numel()
        fg_gt_classes = self.gt_classes[fg_inds]
        fg_pred_classes = pred_classes[fg_inds]

        # Foreground proposals misclassified as background.
        num_false_negative = ((
            fg_pred_classes == bg_class_ind).nonzero().numel())
        num_accurate = (pred_classes == self.gt_classes).nonzero().numel()
        fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()

        storage = get_event_storage()
        # NOTE(review): assumes num_instances > 0; an empty proposal batch
        # would divide by zero here — confirm upstream always samples >= 1.
        storage.put_scalar('fast_rcnn/cls_accuracy',
                           num_accurate / num_instances)
        if num_fg > 0:
            storage.put_scalar('fast_rcnn/fg_cls_accuracy',
                               fg_num_accurate / num_fg)
            storage.put_scalar('fast_rcnn/false_negative',
                               num_false_negative / num_fg)

    def softmax_cross_entropy_loss(self):
        """
        Compute the softmax cross entropy loss for box classification.
        Returns:
            scalar Tensor
        """
        self._log_accuracy()
        return F.cross_entropy(
            self.pred_class_logits, self.gt_classes, reduction='mean')

    def smooth_l1_loss(self):
        """
        Compute the smooth L1 loss for box regression.
        Returns:
            scalar Tensor
        """
        # Regression targets: deltas mapping each proposal onto its gt box.
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor, self.gt_boxes.tensor)
        box_dim = gt_proposal_deltas.size(1)  # 4 or 5
        # Class-agnostic regression predicts a single set of B deltas.
        cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
        device = self.pred_proposal_deltas.device

        bg_class_ind = self.pred_class_logits.shape[1] - 1

        # Only foreground proposals contribute to the regression loss.
        fg_inds = torch.nonzero((self.gt_classes >= 0)
                                & (self.gt_classes < bg_class_ind)).squeeze(1)
        if cls_agnostic_bbox_reg:
            # pred_proposal_deltas only corresponds to foreground class for agnostic
            gt_class_cols = torch.arange(box_dim, device=device)
        else:
            # Select the B delta columns belonging to each proposal's gt class.
            fg_gt_classes = self.gt_classes[fg_inds]
            gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
                box_dim, device=device)

        loss_box_reg = smooth_l1_loss(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            gt_proposal_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction='sum',
        )
        # Normalize by the total number of sampled proposals (fg + bg),
        # not only the foreground count.
        loss_box_reg = loss_box_reg / self.gt_classes.numel()
        return loss_box_reg

    def losses(self):
        """
        Compute the default losses for box head in Fast(er) R-CNN,
        with softmax cross entropy loss and smooth L1 loss.
        Returns:
            A dict of losses (scalar tensors) containing keys "loss_cls" and "loss_box_reg".
        """
        return {
            'loss_cls': self.softmax_cross_entropy_loss(),
            'loss_box_reg': self.smooth_l1_loss(),
        }

    def predict_boxes(self):
        """
        Returns:
            list[Tensor]: A list of Tensors of predicted class-specific or class-agnostic boxes
                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
                the number of predicted objects for image i and B is the box dimension (4 or 5)
        """
        num_pred = len(self.proposals)
        B = self.proposals.tensor.shape[1]
        K = self.pred_proposal_deltas.shape[1] // B
        # Apply each proposal's K sets of deltas to K broadcast copies of
        # the proposal box, then split the flat result back per image.
        boxes = self.box2box_transform.apply_deltas(
            self.pred_proposal_deltas.view(num_pred * K, B),
            self.proposals.tensor.unsqueeze(1).expand(num_pred, K,
                                                      B).reshape(-1, B),
        )
        return boxes.view(num_pred, K * B).split(
            self.num_preds_per_image, dim=0)

    def predict_probs(self):
        """
        Returns:
            list[Tensor]: A list of Tensors of predicted class probabilities for each image.
                Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
                for image i.
        """
        probs = F.softmax(self.pred_class_logits, dim=-1)
        return probs.split(self.num_preds_per_image, dim=0)

    def inference(self, score_thresh, nms_thresh, topk_per_image):
        """
        Args:
            score_thresh (float): same as fast_rcnn_inference.
            nms_thresh (float): same as fast_rcnn_inference.
            topk_per_image (int): same as fast_rcnn_inference.
        Returns:
            list[Instances]: same as fast_rcnn_inference.
            list[Tensor]: same as fast_rcnn_inference.
        """
        boxes = self.predict_boxes()
        scores = self.predict_probs()
        image_shapes = self.image_shapes

        return fast_rcnn_inference(
            boxes,
            scores,
            image_shapes,
            score_thresh,
            nms_thresh,
            topk_per_image,
        )
class FastRCNNOutputLayers(nn.Module):
    """Fast R-CNN prediction heads.

    Two parallel linear layers on top of the pooled per-region features:
    (1) classification scores over ``num_classes`` foreground classes plus
    one background class, and (2) proposal-to-detection box regression
    deltas (class-specific or class-agnostic).
    """

    def __init__(self,
                 cfg,
                 input_size,
                 num_classes,
                 cls_agnostic_bbox_reg,
                 box_dim=4):
        """
        Args:
            cfg: config
            input_size (int): channels, or (channels, height, width)
            num_classes (int): number of foreground classes
            cls_agnostic_bbox_reg (bool): share one set of deltas across classes
            box_dim (int): 4 for regular XYXY boxes, 5 for rotated XYWHA boxes
        """
        super(FastRCNNOutputLayers, self).__init__()
        if not isinstance(input_size, int):
            # Flatten a (channels, height, width) shape into one feature size.
            input_size = np.prod(input_size)

        # One extra output for the implicit background class.
        self.cls_score = nn.Linear(input_size, num_classes + 1)
        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
        self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)

        nn.init.normal_(self.cls_score.weight, std=0.01)
        nn.init.normal_(self.bbox_pred.weight, std=0.001)
        for layer in [self.cls_score, self.bbox_pred]:
            nn.init.constant_(layer.bias, 0)

        self._do_cls_dropout = cfg.MODEL.ROI_HEADS.CLS_DROPOUT
        self._dropout_ratio = cfg.MODEL.ROI_HEADS.DROPOUT_RATIO

    def forward(self, x):
        """Return ``(scores, proposal_deltas)`` for input features ``x``."""
        if x.dim() > 2:
            x = torch.flatten(x, start_dim=1)
        proposal_deltas = self.bbox_pred(x)

        # Dropout (when enabled) is applied to the classification branch
        # only; the regression branch sees the undropped features.
        cls_input = x
        if self._do_cls_dropout:
            cls_input = F.dropout(
                cls_input, self._dropout_ratio, training=self.training)
        scores = self.cls_score(cls_input)

        return scores, proposal_deltas

View File

@@ -0,0 +1,43 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/gdl.py
import torch
import torch.nn as nn
from torch.autograd import Function
class GradientDecoupleLayer(Function):
    """Identity in the forward pass; scales gradients by ``_lambda`` in the
    backward pass (the core operation of DeFRCN's gradient decoupling)."""

    @staticmethod
    def forward(ctx, x, _lambda):
        # Stash the scale factor for use during backpropagation.
        ctx._lambda = _lambda
        return x

    @staticmethod
    def backward(ctx, grad_output):
        # Scale the incoming gradient; no gradient flows to ``_lambda``.
        return grad_output * ctx._lambda, None
class AffineLayer(nn.Module):
    """Per-channel affine transform ``y = w * x (+ b)``.

    The weight is initialized to 1 and the optional bias to 0, so the layer
    starts as an identity mapping.
    """

    def __init__(self, num_channels, bias=False):
        super(AffineLayer, self).__init__()
        self.weight = nn.Parameter(
            torch.ones(1, num_channels, 1, 1), requires_grad=True)
        if bias:
            self.bias = nn.Parameter(
                torch.zeros(1, num_channels, 1, 1), requires_grad=True)
        else:
            # No bias term requested; forward() skips the addition.
            self.bias = None

    def forward(self, X):
        scaled = self.weight.expand_as(X) * X
        if self.bias is None:
            return scaled
        return scaled + self.bias.expand_as(X)
def decouple_layer(x, _lambda):
    """Apply gradient decoupling to ``x``: identity in the forward pass,
    gradients scaled by ``_lambda`` in the backward pass."""
    return GradientDecoupleLayer.apply(x, _lambda)

View File

@@ -0,0 +1,302 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/roi_heads/roi_heads.py
from typing import Dict
import numpy as np
import torch
from detectron2.layers import ShapeSpec
from detectron2.modeling.backbone.resnet import BottleneckBlock, make_stage
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.proposal_generator.proposal_utils import \
add_ground_truth_to_proposals
from detectron2.modeling.roi_heads import select_foreground_proposals
from detectron2.modeling.sampling import subsample_labels
from detectron2.structures import Boxes, Instances, pairwise_iou
from detectron2.utils.events import get_event_storage
from torch import nn
from .fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs
class ROIHeads(torch.nn.Module):
    """
    ROIHeads perform all per-region computation in an R-CNN.
    It contains logic of cropping the regions, extract per-region features,
    and make per-region predictions.
    It can have many variants, implemented as subclasses of this class.
    """

    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
        super(ROIHeads, self).__init__()

        # fmt: off
        self.batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE
        self.positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
        self.test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
        self.test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
        self.test_detections_per_img = cfg.TEST.DETECTIONS_PER_IMAGE
        self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
        self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES
        self.proposal_append_gt = cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT
        # Per-feature-map stride and channel count from the backbone shapes.
        self.feature_strides = {k: v.stride for k, v in input_shape.items()}
        self.feature_channels = {k: v.channels for k, v in input_shape.items()}
        self.cls_agnostic_bbox_reg = cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
        self.smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA
        # fmt: on

        # Matcher to assign box proposals to gt boxes
        self.proposal_matcher = Matcher(
            cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS,
            cfg.MODEL.ROI_HEADS.IOU_LABELS,
            allow_low_quality_matches=False,
        )

        # Box2BoxTransform for bounding box regression
        self.box2box_transform = Box2BoxTransform(
            weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)

    def _sample_proposals(self, matched_idxs, matched_labels, gt_classes):
        """
        Based on the matching between N proposals and M groundtruth,
        sample the proposals and set their classification labels.
        Args:
            matched_idxs (Tensor): a vector of length N, each is the best-matched
                gt index in [0, M) for each proposal.
            matched_labels (Tensor): a vector of length N, the matcher's label
                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
            gt_classes (Tensor): a vector of length M.
        Returns:
            Tensor: a vector of indices of sampled proposals. Each is in [0, N).
            Tensor: a vector of the same length, the classification label for
                each sampled proposal. Each sample is labeled as either a category in
                [0, num_classes) or the background (num_classes).
        """
        has_gt = gt_classes.numel() > 0
        # Get the corresponding GT for each proposal
        if has_gt:
            gt_classes = gt_classes[matched_idxs]
            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
            gt_classes[matched_labels == 0] = self.num_classes
            # Label ignore proposals (-1 label)
            gt_classes[matched_labels == -1] = -1
        else:
            # No ground truth in this image: everything is background.
            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes

        # Subsample to batch_size_per_image with the configured fg fraction.
        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
            gt_classes,
            self.batch_size_per_image,
            self.positive_sample_fraction,
            self.num_classes,
        )

        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
        return sampled_idxs, gt_classes[sampled_idxs]

    @torch.no_grad()
    def label_and_sample_proposals(self, proposals, targets):
        """
        Prepare some proposals to be used to train the ROI heads.
        It performs box matching between `proposals` and `targets`, and assigns
        training labels to the proposals.
        It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes,
        with a fraction of positives that is no larger than `self.positive_sample_fraction`.
        Args:
            See :meth:`ROIHeads.forward`
        Returns:
            list[Instances]:
                length `N` list of `Instances`s containing the proposals
                sampled for training. Each `Instances` has the following fields:
                - proposal_boxes: the proposal boxes
                - gt_boxes: the ground-truth box that the proposal is assigned to
                  (this is only meaningful if the proposal has a label > 0; if label = 0
                  then the ground-truth box is random)
                Other fields such as "gt_classes" that's included in `targets`.
        """
        gt_boxes = [x.gt_boxes for x in targets]
        if self.proposal_append_gt:
            # Append gt boxes as proposals so each gt has at least one match.
            proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

        proposals_with_gt = []

        num_fg_samples = []
        num_bg_samples = []
        for proposals_per_image, targets_per_image in zip(proposals, targets):
            has_gt = len(targets_per_image) > 0
            match_quality_matrix = pairwise_iou(
                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
            matched_idxs, matched_labels = self.proposal_matcher(
                match_quality_matrix)
            sampled_idxs, gt_classes = self._sample_proposals(
                matched_idxs, matched_labels, targets_per_image.gt_classes)

            # Set target attributes of the sampled proposals:
            proposals_per_image = proposals_per_image[sampled_idxs]
            proposals_per_image.gt_classes = gt_classes

            # We index all the attributes of targets that start with "gt_"
            # and have not been added to proposals yet (="gt_classes").
            if has_gt:
                sampled_targets = matched_idxs[sampled_idxs]
                for (
                        trg_name,
                        trg_value,
                ) in targets_per_image.get_fields().items():
                    if trg_name.startswith(
                            'gt_') and not proposals_per_image.has(trg_name):
                        proposals_per_image.set(trg_name,
                                                trg_value[sampled_targets])
            else:
                # No gt in this image: attach all-zero gt boxes as placeholders.
                gt_boxes = Boxes(
                    targets_per_image.gt_boxes.tensor.new_zeros(
                        (len(sampled_idxs), 4)))
                proposals_per_image.gt_boxes = gt_boxes

            num_bg_samples.append(
                (gt_classes == self.num_classes).sum().item())
            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
            proposals_with_gt.append(proposals_per_image)

        # Log the number of fg/bg samples that are selected for training ROI heads
        storage = get_event_storage()
        storage.put_scalar('roi_head/num_fg_samples', np.mean(num_fg_samples))
        storage.put_scalar('roi_head/num_bg_samples', np.mean(num_bg_samples))

        return proposals_with_gt

    def forward(self, images, features, proposals, targets=None):
        """
        Args:
            images (ImageList):
            features (dict[str: Tensor]): input data as a mapping from feature
                map name to tensor. Axis 0 represents the number of images `N` in
                the input data; axes 1-3 are channels, height, and width, which may
                vary between feature maps (e.g., if a feature pyramid is used).
            proposals (list[Instances]): length `N` list of `Instances`s. The i-th
                `Instances` contains object proposals for the i-th input image,
                with fields "proposal_boxes" and "objectness_logits".
            targets (list[Instances], optional): length `N` list of `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image. Specify `targets` during training only.
                It may have the following fields:
                - gt_boxes: the bounding box of each instance.
                - gt_classes: the label for each instance with a category ranging in [0, #class].
        Returns:
            results (list[Instances]): length `N` list of `Instances`s containing the
                detected instances. Returned during inference only; may be []
                during training.
            losses (dict[str: Tensor]): mapping from a named loss to a tensor
                storing the loss. Used during training only.
        """
        raise NotImplementedError()
class Res5ROIHeads(ROIHeads):
    """
    The ROIHeads in a typical "C4" R-CNN model, where the heads share the
    cropping and the per-region feature computation by a Res5 block.
    """

    def __init__(self, cfg, input_shape):
        super().__init__(cfg, input_shape)

        # The C4 head operates on exactly one backbone feature map.
        assert len(self.in_features) == 1

        # fmt: off
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        pooler_scales = (1.0 / self.feature_strides[self.in_features[0]], )
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        # fmt: on

        self.pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )

        # Shared res5 block acts as the box head; its output channel count
        # determines the predictor's input size.
        self.res5, out_channels = self._build_res5_block(cfg)
        self.box_predictor = FastRCNNOutputLayers(cfg, out_channels,
                                                  self.num_classes,
                                                  self.cls_agnostic_bbox_reg)

    def _build_res5_block(self, cfg):
        """Construct the res5 stage used as the shared per-region feature head.

        Returns:
            (nn.Sequential, int): the res5 blocks and their output channels.
        """
        # fmt: off
        stage_channel_factor = 2**3  # res5 is 8x res2
        num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
        width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
        bottleneck_channels = num_groups * width_per_group * stage_channel_factor
        out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * stage_channel_factor
        stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1
        norm = cfg.MODEL.RESNETS.NORM
        assert not cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE[-1], \
            'Deformable conv is not yet supported in res5 head.'
        # fmt: on

        # Three bottleneck blocks; the first downsamples spatially by 2.
        blocks = make_stage(
            BottleneckBlock,
            3,
            first_stride=2,
            in_channels=out_channels // 2,
            bottleneck_channels=bottleneck_channels,
            out_channels=out_channels,
            num_groups=num_groups,
            norm=norm,
            stride_in_1x1=stride_in_1x1,
        )
        return nn.Sequential(*blocks), out_channels

    def _shared_roi_transform(self, features, boxes):
        """ROI-pool the boxes from the feature map(s), then run them through res5."""
        x = self.pooler(features, boxes)
        x = self.res5(x)
        return x

    def forward(self, images, features, proposals, targets=None):
        """
        See :class:`ROIHeads.forward`.
        """
        del images
        if self.training:
            # Match proposals to gt and subsample a fg/bg training batch.
            proposals = self.label_and_sample_proposals(proposals, targets)
        del targets

        proposal_boxes = [x.proposal_boxes for x in proposals]
        box_features = self._shared_roi_transform(
            [features[f] for f in self.in_features], proposal_boxes)
        feature_pooled = box_features.mean(dim=[2, 3])  # pooled to 1x1
        pred_class_logits, pred_proposal_deltas = self.box_predictor(
            feature_pooled)
        del feature_pooled

        outputs = FastRCNNOutputs(
            self.box2box_transform,
            pred_class_logits,
            pred_proposal_deltas,
            proposals,
            self.smooth_l1_beta,
        )
        if self.training:
            del features
            losses = outputs.losses()
            return [], losses
        else:
            pred_instances, _ = outputs.inference(
                self.test_score_thresh,
                self.test_nms_thresh,
                self.test_detections_per_img,
            )
            return pred_instances, {}

View File

@@ -0,0 +1,81 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import importlib
import sys
from collections import OrderedDict
from packaging import version
from modelscope.utils.import_utils import _torch_available
if sys.version_info < (3, 8):
import importlib_metadata
else:
import importlib.metadata as importlib_metadata
# Pinned detectron2 version; is_detectron2_version_available() compares
# major.minor against this.
DETECTRON2_REQUIRED_VERSION = version.parse('0.3')
def is_detectron2_version_available():
    """Return True if detectron2 is installed at the required 0.3.x version."""
    if importlib.util.find_spec('detectron2') is None:
        return False
    installed = version.parse(importlib_metadata.version('detectron2'))
    required = DETECTRON2_REQUIRED_VERSION
    # Only major.minor must match; patch releases are accepted.
    return (installed.major, installed.minor) == (required.major,
                                                  required.minor)
# Pinned torch version; is_torch_version_available() compares major.minor
# against this.
TORCH_REQUIRED_VERSION = version.parse('1.11')
def is_torch_version_available():
    """Return True if torch is installed at the required 1.11.x version."""
    if not _torch_available:
        return False
    installed = version.parse(importlib_metadata.version('torch'))
    # Only major.minor must match; patch releases are accepted.
    return (installed.major, installed.minor) == (TORCH_REQUIRED_VERSION.major,
                                                  TORCH_REQUIRED_VERSION.minor)
# Error-message templates; '{0}' is replaced with the requesting feature name.
DETECTRON2_IMPORT_ERROR = """
{0} requires the detectron2-0.3 but it was not found in your environment.
You can install it from modelscope lib with pip:
`pip install detectron2==0.3`
"""
TORCH_VERSION_IMPORT_ERROR = """
{0} requires the torch-1.11 but it was not found in your environment. You can install it with pip:
`pip install torch==1.11`
"""
# Maps a requirement tag to (availability_check, error_message_template).
# NOTE(review): 'MAAPING' looks like a typo for 'MAPPING'; the name is kept
# unchanged because requires_version() in this module refers to it.
REQUIREMENTS_MAAPING_VERSION = OrderedDict([
    ('detectron2-0.3', (is_detectron2_version_available,
                        DETECTRON2_IMPORT_ERROR)),
    ('torch-1.11', (is_torch_version_available, TORCH_VERSION_IMPORT_ERROR)),
])
# Requirement tags verified by requires_version().
REQUIREMENTS = ['detectron2-0.3', 'torch-1.11']
def requires_version():
    """Verify that every pinned DeFRCN dependency is installed.

    Raises:
        NotImplementedError: if a requirement tag has no registered checker.
        ImportError: aggregating the messages of every failed version check.
    """
    checks = []
    for req in REQUIREMENTS:
        if req in REQUIREMENTS_MAAPING_VERSION:
            check = REQUIREMENTS_MAAPING_VERSION[req]
        else:
            # Fixed the previously ungrammatical message
            # ('{} do not supported check').
            raise NotImplementedError(
                'version check is not supported for {}'.format(req))
        checks.append(check)
    # Render one message per unavailable requirement.
    failed = [
        msg.format('DeFRCN') for available, msg in checks if not available()
    ]
    if failed:
        raise ImportError(''.join(failed))

View File

@@ -0,0 +1,342 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/data/meta_voc.py
import os
import xml.etree.ElementTree as ET
import numpy as np
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
from fvcore.common.file_io import PathManager
# PASCAL VOC categories
# Few-shot VOC class splits. For each split id (1-3) the 20 classes are
# ordered with the 15 base classes first and the 5 novel classes last
# (matching PASCAL_VOC_BASE_CATEGORIES and PASCAL_VOC_NOVEL_CATEGORIES).
PASCAL_VOC_ALL_CATEGORIES = {
    1: [
        'aeroplane',
        'bicycle',
        'boat',
        'bottle',
        'car',
        'cat',
        'chair',
        'diningtable',
        'dog',
        'horse',
        'person',
        'pottedplant',
        'sheep',
        'train',
        'tvmonitor',
        'bird',
        'bus',
        'cow',
        'motorbike',
        'sofa',
    ],
    2: [
        'bicycle',
        'bird',
        'boat',
        'bus',
        'car',
        'cat',
        'chair',
        'diningtable',
        'dog',
        'motorbike',
        'person',
        'pottedplant',
        'sheep',
        'train',
        'tvmonitor',
        'aeroplane',
        'bottle',
        'cow',
        'horse',
        'sofa',
    ],
    3: [
        'aeroplane',
        'bicycle',
        'bird',
        'bottle',
        'bus',
        'car',
        'chair',
        'cow',
        'diningtable',
        'dog',
        'horse',
        'person',
        'pottedplant',
        'train',
        'tvmonitor',
        'boat',
        'cat',
        'motorbike',
        'sheep',
        'sofa',
    ]
}
# The 5 novel (few-shot) classes for each split id.
PASCAL_VOC_NOVEL_CATEGORIES = {
    1: ['bird', 'bus', 'cow', 'motorbike', 'sofa'],
    2: ['aeroplane', 'bottle', 'cow', 'horse', 'sofa'],
    3: ['boat', 'cat', 'motorbike', 'sheep', 'sofa']
}
# The 15 base classes (used for base training) for each split id.
PASCAL_VOC_BASE_CATEGORIES = {
    1: [
        'aeroplane',
        'bicycle',
        'boat',
        'bottle',
        'car',
        'cat',
        'chair',
        'diningtable',
        'dog',
        'horse',
        'person',
        'pottedplant',
        'sheep',
        'train',
        'tvmonitor',
    ],
    2: [
        'bicycle',
        'bird',
        'boat',
        'bus',
        'car',
        'cat',
        'chair',
        'diningtable',
        'dog',
        'motorbike',
        'person',
        'pottedplant',
        'sheep',
        'train',
        'tvmonitor',
    ],
    3: [
        'aeroplane',
        'bicycle',
        'bird',
        'bottle',
        'bus',
        'car',
        'chair',
        'cow',
        'diningtable',
        'dog',
        'horse',
        'person',
        'pottedplant',
        'train',
        'tvmonitor',
    ]
}
def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
                                classnames: list):
    """
    Load Pascal VOC detection annotations to Detectron2 format.

    Args:
        name: registered dataset name. If it contains 'shot', the few-shot
            split files under ``<root>/vocsplit`` are read instead of the
            standard image sets.
        root: dataset root directory.
        dirname: Contain "Annotations", "ImageSets", "JPEGImages"
        split (str): one of "train", "test", "val", "trainval"
        classnames: ordered list of category names; a class's index in this
            list becomes its ``category_id``.

    Returns:
        list[dict]: dataset dicts in the standard Detectron2 format.
    """
    is_shots = 'shot' in name
    dicts = []
    if is_shots:
        fileids = {}
        split_dir = os.path.join(root, 'vocsplit')
        # e.g. 'voc_2007_trainval_novel1_3shot_seed5' -> shot '3', seed 5
        shot = name.split('_')[-2].split('shot')[0]
        seed = int(name.split('_seed')[-1])
        split_dir = os.path.join(split_dir, 'seed{}'.format(seed))
        for cls in classnames:
            with PathManager.open(
                    os.path.join(split_dir,
                                 'box_{}shot_{}_train.txt'.format(shot,
                                                                  cls))) as f:
                # np.str was removed in NumPy 1.24; the builtin str is the
                # documented replacement and behaves identically here.
                fileids_ = np.loadtxt(f, dtype=str).tolist()
                if isinstance(fileids_, str):
                    fileids_ = [fileids_]
                fileids_ = [
                    fid.split('/')[-1].split('.jpg')[0] for fid in fileids_
                ]
                fileids[cls] = fileids_
        for cls, fileids_ in fileids.items():
            dicts_ = []
            for fileid in fileids_:
                # VOC2012 image ids contain an underscore (e.g. '2008_000008').
                year = '2012' if '_' in fileid else '2007'
                dir_voc = os.path.join(root, 'VOC{}'.format(year))
                anno_file = os.path.join(dir_voc, 'Annotations',
                                         fileid + '.xml')
                jpeg_file = os.path.join(dir_voc, 'JPEGImages',
                                         fileid + '.jpg')
                tree = ET.parse(anno_file)
                for obj in tree.findall('object'):
                    r = {
                        'file_name': jpeg_file,
                        'image_id': fileid,
                        'height': int(tree.findall('./size/height')[0].text),
                        'width': int(tree.findall('./size/width')[0].text),
                    }
                    cls_ = obj.find('name').text
                    if cls != cls_:
                        continue
                    bbox = obj.find('bndbox')
                    bbox = [
                        float(bbox.find(x).text)
                        for x in ['xmin', 'ymin', 'xmax', 'ymax']
                    ]
                    # VOC boxes are 1-indexed; shift to 0-indexed coordinates.
                    bbox[0] -= 1.0
                    bbox[1] -= 1.0
                    instances = [{
                        'category_id': classnames.index(cls),
                        'bbox': bbox,
                        'bbox_mode': BoxMode.XYXY_ABS,
                    }]
                    r['annotations'] = instances
                    dicts_.append(r)
            if len(dicts_) > int(shot):
                # Sample exactly `shot` annotated instances for this class.
                dicts_ = np.random.choice(dicts_, int(shot), replace=False)
            dicts.extend(dicts_)
    else:
        with PathManager.open(
                os.path.join(root, dirname, 'ImageSets', 'Main',
                             split + '.txt')) as f:
            # np.str was removed in NumPy 1.24; use the builtin str.
            fileids = np.loadtxt(f, dtype=str)
        for fileid in fileids:
            anno_file = os.path.join(root, dirname, 'Annotations',
                                     fileid + '.xml')
            jpeg_file = os.path.join(root, dirname, 'JPEGImages',
                                     fileid + '.jpg')
            tree = ET.parse(anno_file)
            r = {
                'file_name': jpeg_file,
                'image_id': fileid,
                'height': int(tree.findall('./size/height')[0].text),
                'width': int(tree.findall('./size/width')[0].text),
            }
            instances = []
            for obj in tree.findall('object'):
                cls = obj.find('name').text
                if not (cls in classnames):
                    continue
                bbox = obj.find('bndbox')
                bbox = [
                    float(bbox.find(x).text)
                    for x in ['xmin', 'ymin', 'xmax', 'ymax']
                ]
                # VOC boxes are 1-indexed; shift to 0-indexed coordinates.
                bbox[0] -= 1.0
                bbox[1] -= 1.0
                instances.append({
                    'category_id': classnames.index(cls),
                    'bbox': bbox,
                    'bbox_mode': BoxMode.XYXY_ABS,
                })
            r['annotations'] = instances
            dicts.append(r)
    return dicts
def register_meta_voc(name, root, dirname, split, year, keepclasses, sid):
    """Register one few-shot VOC split in the Detectron2 dataset catalogs.

    Args:
        name: dataset name to register.
        root: dataset root directory.
        dirname: VOC directory name, e.g. 'VOC2007'.
        split: image-set split, or few-shot split file prefix.
        year: 2007 or 2012.
        keepclasses: which class set to keep: starts with 'base_novel',
            'base' or 'novel'.
        sid: class-split id in {1, 2, 3}.

    Raises:
        ValueError: if ``keepclasses`` matches none of the known prefixes.
    """
    if keepclasses.startswith('base_novel'):
        thing_classes = PASCAL_VOC_ALL_CATEGORIES[sid]
    elif keepclasses.startswith('base'):
        thing_classes = PASCAL_VOC_BASE_CATEGORIES[sid]
    elif keepclasses.startswith('novel'):
        thing_classes = PASCAL_VOC_NOVEL_CATEGORIES[sid]
    else:
        # Previously an unknown value fell through and raised a confusing
        # NameError on `thing_classes` below; fail fast instead.
        raise ValueError('unknown keepclasses: {}'.format(keepclasses))
    DatasetCatalog.register(
        name,
        lambda: load_filtered_voc_instances(name, root, dirname, split,
                                            thing_classes),
    )
    MetadataCatalog.get(name).set(
        thing_classes=thing_classes,
        dirname=os.path.join(root, dirname),
        year=year,
        split=split,
        base_classes=PASCAL_VOC_BASE_CATEGORIES[sid],
        novel_classes=PASCAL_VOC_NOVEL_CATEGORIES[sid],
    )
def register_all_voc(root='datasets'):
    """Register every base/novel/all VOC split used by DeFRCN.

    Args:
        root: dataset root directory containing 'VOC2007', 'VOC2012' and
            'vocsplit'.
    """
    # Entries are (name, VOC dirname, image-set split, kept class set, sid).
    METASPLITS = [
        ('voc_2007_trainval_base1', 'VOC2007', 'trainval', 'base1', 1),
        ('voc_2007_trainval_base2', 'VOC2007', 'trainval', 'base2', 2),
        ('voc_2007_trainval_base3', 'VOC2007', 'trainval', 'base3', 3),
        ('voc_2012_trainval_base1', 'VOC2012', 'trainval', 'base1', 1),
        ('voc_2012_trainval_base2', 'VOC2012', 'trainval', 'base2', 2),
        ('voc_2012_trainval_base3', 'VOC2012', 'trainval', 'base3', 3),
        ('voc_2007_trainval_all1', 'VOC2007', 'trainval', 'base_novel_1', 1),
        ('voc_2007_trainval_all2', 'VOC2007', 'trainval', 'base_novel_2', 2),
        ('voc_2007_trainval_all3', 'VOC2007', 'trainval', 'base_novel_3', 3),
        ('voc_2012_trainval_all1', 'VOC2012', 'trainval', 'base_novel_1', 1),
        ('voc_2012_trainval_all2', 'VOC2012', 'trainval', 'base_novel_2', 2),
        ('voc_2012_trainval_all3', 'VOC2012', 'trainval', 'base_novel_3', 3),
        ('voc_2007_test_base1', 'VOC2007', 'test', 'base1', 1),
        ('voc_2007_test_base2', 'VOC2007', 'test', 'base2', 2),
        ('voc_2007_test_base3', 'VOC2007', 'test', 'base3', 3),
        ('voc_2007_test_novel1', 'VOC2007', 'test', 'novel1', 1),
        ('voc_2007_test_novel2', 'VOC2007', 'test', 'novel2', 2),
        ('voc_2007_test_novel3', 'VOC2007', 'test', 'novel3', 3),
        ('voc_2007_test_all1', 'VOC2007', 'test', 'base_novel_1', 1),
        ('voc_2007_test_all2', 'VOC2007', 'test', 'base_novel_2', 2),
        ('voc_2007_test_all3', 'VOC2007', 'test', 'base_novel_3', 3),
    ]
    # Add the few-shot fine-tuning splits: 'all'/'novel' prefixes, 3 class
    # splits, {1,2,3,5,10} shots, 2 years and 30 random seeds each.
    for prefix in ['all', 'novel']:
        for sid in range(1, 4):
            for shot in [1, 2, 3, 5, 10]:
                for year in [2007, 2012]:
                    for seed in range(30):
                        seed = '_seed{}'.format(seed)
                        name = 'voc_{}_trainval_{}{}_{}shot{}'.format(
                            year, prefix, sid, shot, seed)
                        dirname = 'VOC{}'.format(year)
                        img_file = '{}_{}shot_split_{}_trainval'.format(
                            prefix, shot, sid)
                        keepclasses = ('base_novel_{}'.format(sid) if prefix
                                       == 'all' else 'novel{}'.format(sid))
                        METASPLITS.append(
                            (name, dirname, img_file, keepclasses, sid))
    for name, dirname, split, keepclasses, sid in METASPLITS:
        # Skip names already registered in this process.
        if name in DatasetCatalog:
            continue
        year = 2007 if '2007' in name else 2012
        register_meta_voc(
            name,
            root,
            dirname,
            split,
            year,
            keepclasses,
            sid,
        )
        MetadataCatalog.get(name).evaluator_type = 'pascal_voc'

View File

@@ -82,6 +82,8 @@ TASK_INPUTS = {
InputType.IMAGE,
Tasks.portrait_matting:
InputType.IMAGE,
Tasks.image_fewshot_detection:
InputType.IMAGE,
# image editing task result for a single image
Tasks.skin_retouching:

View File

@@ -269,6 +269,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.image_multi_view_depth_estimation: (
Pipelines.image_multi_view_depth_estimation,
'damo/cv_casmvs_multi-view-depth-estimation_general'),
Tasks.image_fewshot_detection: (
Pipelines.image_fewshot_detection,
'damo/cv_resnet101_detection_fewshot-defrcn'),
Tasks.image_body_reshaping: (Pipelines.image_body_reshaping,
'damo/cv_flow-based-body-reshaping_damo'),
Tasks.image_face_fusion: (Pipelines.image_face_fusion,

View File

@@ -83,6 +83,7 @@ if TYPE_CHECKING:
from .image_mvs_depth_estimation_pipeline import ImageMultiViewDepthEstimationPipeline
from .panorama_depth_estimation_pipeline import PanoramaDepthEstimationPipeline
from .ddcolor_image_colorization_pipeline import DDColorImageColorizationPipeline
from .image_defrcn_fewshot_pipeline import ImageDefrcnDetectionPipeline
else:
_import_structure = {
@@ -197,6 +198,7 @@ else:
'ddcolor_image_colorization_pipeline': [
'DDColorImageColorizationPipeline'
],
'image_defrcn_fewshot_pipeline': ['ImageDefrcnDetectionPipeline'],
}
import sys

View File

@@ -0,0 +1,104 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict
import numpy as np
import torch
from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
@PIPELINES.register_module(
    Tasks.image_fewshot_detection,
    module_name=Pipelines.image_fewshot_detection)
class ImageDefrcnDetectionPipeline(Pipeline):
    """ Image DeFRCN few-shot detection Pipeline. Given a image,
    pipeline will return the detection results on the image.

    Example:

    ```python
    >>> from modelscope.pipelines import pipeline
    >>> detector = pipeline('image-fewshot-detection', 'damo/cv_resnet101_detection_fewshot-defrcn')
    >>> detector('/Path/Image')
    {
        'scores': [0.8307567834854126, 0.1606406420469284],
        'labels': ['person', 'dog'],
        'boxes': [
            [27.391937255859375, 0.0, 353.0, 500.0],
            [64.22428131103516, 229.2884521484375, 213.90573120117188, 370.0657958984375]
        ]
    }
    >>> #
    ```
    """

    def __init__(self, model: str, **kwargs):
        """
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, auto_collate=False, **kwargs)
        model_path = os.path.join(self.model.model_dir,
                                  ModelFile.TORCH_MODEL_FILE)
        # Load the fine-tuned detection weights into the wrapped model.
        self.model.model = self._load_pretrained(
            self.model.model, model_path, self.model.model_cfg.MODEL.DEVICE)

    def _load_pretrained(self, net, load_path, device='cuda', strict=True):
        """Load a checkpoint, dropping training-only state before loading.

        Args:
            net: torch module to receive the weights.
            load_path: checkpoint file path.
            device: map_location passed to ``torch.load``.
            strict: forwarded to ``load_state_dict``.

        Returns:
            The module with the loaded weights.
        """
        load_net = torch.load(load_path, map_location=device)
        # Strip optimizer/scheduler/iteration entries saved during training.
        if 'scheduler' in load_net:
            del load_net['scheduler']
        if 'optimizer' in load_net:
            del load_net['optimizer']
        if 'iteration' in load_net:
            del load_net['iteration']
        net.load_state_dict(load_net['model'], strict=strict)
        return net

    def preprocess(self, input: Input) -> Dict[str, Any]:
        """Convert the input image to a CHW float tensor in BGR order."""
        img = LoadImage.convert_to_ndarray(input)
        # Fix: np.float was removed in NumPy 1.24; np.float64 is the exact
        # equivalent of the old alias (builtin float).
        img = img.astype(np.float64)
        image = img[..., ::-1].copy()  # rgb to bgr
        tim = torch.Tensor(image).permute(2, 0, 1)
        result = {'image': tim}
        return result

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Run model inference on the preprocessed input."""
        outputs = self.model.inference(input)
        result = {'data': outputs}
        return result

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Convert raw instances to scores/labels/boxes output lists."""
        # No detections: return empty result lists.
        if inputs['data'] is None:
            outputs = {
                OutputKeys.SCORES: [],
                OutputKeys.LABELS: [],
                OutputKeys.BOXES: []
            }
            return outputs
        objects = inputs['data']['instances'].get_fields()
        labels, bboxes = [], []
        for label, box in zip(objects['pred_classes'], objects['pred_boxes']):
            # Map the predicted class index to its human-readable name.
            labels.append(self.model.config.model.classes[label])
            bboxes.append(box.tolist())
        scores = objects['scores'].tolist()
        outputs = {
            OutputKeys.SCORES: scores,
            OutputKeys.LABELS: labels,
            OutputKeys.BOXES: bboxes
        }
        return outputs

View File

@@ -10,6 +10,7 @@ if TYPE_CHECKING:
from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer
from .image_inpainting_trainer import ImageInpaintingTrainer
from .referring_video_object_segmentation_trainer import ReferringVideoObjectSegmentationTrainer
from .image_defrcn_fewshot_detection_trainer import ImageDefrcnFewshotTrainer
else:
_import_structure = {
@@ -20,7 +21,9 @@ else:
'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'],
'image_inpainting_trainer': ['ImageInpaintingTrainer'],
'referring_video_object_segmentation_trainer':
['ReferringVideoObjectSegmentationTrainer']
['ReferringVideoObjectSegmentationTrainer'],
'image_defrcn_fewshot_detection_trainer':
['ImageDefrcnFewshotTrainer']
}
import sys

View File

@@ -0,0 +1,316 @@
# The implementation is adopted from er-muyue/DeFRCN
# made publicly available under the MIT License at
# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/engine/defaults.py
# https://github.com/er-muyue/DeFRCN/blob/main/tools/model_surgery.py
import os
from typing import Callable, Optional, Union
import torch
from detectron2.engine import SimpleTrainer, hooks
from detectron2.evaluation import DatasetEvaluators, verify_results
from detectron2.utils import comm
from torch import nn
from modelscope.metainfo import Trainers
from modelscope.models.base import Model, TorchModel
from modelscope.trainers.base import BaseTrainer
from modelscope.trainers.builder import TRAINERS
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.logger import get_logger
class DefaultTrainer(SimpleTrainer):
    """Detectron2-style trainer wrapping ``SimpleTrainer`` with the standard
    optimizer / scheduler / checkpointer / hook setup used by DeFRCN.
    """

    def __init__(self, model, cfg):
        """
        Args:
            model: the detection model to train.
            cfg: detectron2 config node.
        """
        # Imports are local so that detectron2 is only required at train time.
        # (Removed the unused build_detection_test_loader import.)
        from torch.nn.parallel import DistributedDataParallel
        from detectron2.data.build import build_detection_train_loader
        from detectron2.solver.build import build_optimizer, build_lr_scheduler
        from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer
        from detectron2.utils.logger import setup_logger
        setup_logger()
        optimizer = build_optimizer(cfg, model)
        data_loader = build_detection_train_loader(cfg)
        if comm.get_world_size() > 1:
            model = DistributedDataParallel(
                model,
                device_ids=[comm.get_local_rank()],
                broadcast_buffers=False,
                find_unused_parameters=True)
        super().__init__(model, data_loader, optimizer)
        self.scheduler = build_lr_scheduler(cfg, optimizer)
        # Checkpointer saves/loads the model together with optimizer state.
        self.checkpointer = DetectionCheckpointer(
            model,
            cfg.OUTPUT_DIR,
            optimizer=optimizer,
            scheduler=self.scheduler,
        )
        self.start_iter = 0
        self.max_iter = cfg.SOLVER.MAX_ITER
        self.cfg = cfg
        self.register_hooks(self.build_hooks())

    def resume_or_load(self, resume=True):
        """Load cfg.MODEL.WEIGHTS (or resume the last checkpoint) and set
        ``start_iter`` accordingly."""
        # The checkpoint stores the training iteration that just finished, thus we start
        # at the next iteration (or iter zero if there's no checkpoint).
        self.start_iter = (
            self.checkpointer.resume_or_load(
                self.cfg.MODEL.WEIGHTS, resume=resume).get('iteration', -1)
            + 1)

    def build_hooks(self):
        """
        Build a list of default hooks, including timing, evaluation,
        checkpointing, lr scheduling, precise BN, writing events.

        Returns:
            list[HookBase]:
        """
        # Fix: these names were previously imported only inside __init__ and
        # therefore unbound here, raising NameError as soon as
        # cfg.TEST.PRECISE_BN.ENABLED was true.
        from fvcore.nn.precise_bn import get_bn_modules
        from detectron2.data.build import build_detection_train_loader
        cfg = self.cfg.clone()
        cfg.defrost()
        # Avoid spawning worker processes for the hook's auxiliary loader.
        cfg.DATALOADER.NUM_WORKERS = 0
        ret = [
            hooks.IterationTimer(),
            hooks.LRScheduler(self.optimizer, self.scheduler),
            # None entries are filtered out by register_hooks().
            hooks.PreciseBN(
                cfg.TEST.EVAL_PERIOD,
                self.model,
                build_detection_train_loader(cfg),
                cfg.TEST.PRECISE_BN.NUM_ITER,
            ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
            else None,
        ]
        if comm.is_main_process():
            ret.append(
                hooks.PeriodicCheckpointer(self.checkpointer,
                                           cfg.SOLVER.CHECKPOINT_PERIOD))

        def test_and_save_results():
            self._last_eval_results = self.test(self.cfg, self.model)
            return self._last_eval_results

        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
        if comm.is_main_process():
            ret.append(hooks.PeriodicWriter(self.build_writers(), period=20))
        return ret

    def build_writers(self):
        """Return the default metric writers (console, JSON, TensorBoard)."""
        from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
        return [
            CommonMetricPrinter(self.max_iter),
            JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, 'metrics.json')),
            TensorboardXWriter(self.cfg.OUTPUT_DIR),
        ]

    def train(self):
        """
        Run training.

        Returns:
            OrderedDict of results, if evaluation is enabled. Otherwise None.
        """
        super().train(self.start_iter, self.max_iter)
        if hasattr(self, '_last_eval_results') and comm.is_main_process():
            verify_results(self.cfg, self._last_eval_results)
            return self._last_eval_results

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Create the evaluator matching the dataset's evaluator_type.

        Raises:
            NotImplementedError: if no evaluator exists for the dataset type.
        """
        from detectron2.data import MetadataCatalog
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, 'inference')
        evaluator_list = []
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
        if evaluator_type == 'coco':
            from detectron2.evaluation import COCOEvaluator
            evaluator_list.append(
                COCOEvaluator(dataset_name, True, output_folder))
        if evaluator_type == 'pascal_voc':
            from detectron2.evaluation import PascalVOCDetectionEvaluator
            return PascalVOCDetectionEvaluator(dataset_name)
        if len(evaluator_list) == 0:
            raise NotImplementedError(
                'no Evaluator for the dataset {} with the type {}'.format(
                    dataset_name, evaluator_type))
        if len(evaluator_list) == 1:
            return evaluator_list[0]
        return DatasetEvaluators(evaluator_list)

    @classmethod
    def test(cls, cfg, model, evaluators=None):
        """Evaluate ``model`` using detectron2's stock test loop, but with
        this class's ``build_evaluator``."""
        from detectron2.engine.defaults import DefaultTrainer as _DefaultTrainer
        _DefaultTrainer.build_evaluator = cls.build_evaluator
        return _DefaultTrainer.test(cfg, model, evaluators)
@TRAINERS.register_module(module_name=Trainers.image_fewshot_detection)
class ImageDefrcnFewshotTrainer(BaseTrainer):
    """Trainer for DeFRCN few-shot object detection."""

    def __init__(self,
                 model: Optional[Union[TorchModel, nn.Module, str]] = None,
                 cfg_file: Optional[str] = None,
                 arg_parse_fn: Optional[Callable] = None,
                 model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
                 seed: int = 0,
                 cfg_modify_fn: Optional[Callable] = None,
                 **kwargs):
        """
        Args:
            model: model id on the hub, or an already constructed model.
            cfg_file: configuration file path; required when ``model`` is not
                a hub model id.
            arg_parse_fn: same as in :class:`BaseTrainer`.
            model_revision: hub revision used when downloading the model.
            seed: kept for interface compatibility; not used in this method.
            cfg_modify_fn: optional hook to modify the loaded config.
            kwargs: supports 'datasets_train', 'datasets_test', 'work_dir',
                'data_dir' and 'data_type'.
        """
        if isinstance(model, str):
            self.model_dir = self.get_or_download_model_dir(
                model, model_revision)
            if cfg_file is None:
                cfg_file = os.path.join(self.model_dir,
                                        ModelFile.CONFIGURATION)
        else:
            assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!'
            self.model_dir = os.path.dirname(cfg_file)
        super().__init__(cfg_file, arg_parse_fn)
        if cfg_modify_fn is not None:
            self.cfg = cfg_modify_fn(self.cfg)
        self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO'))
        if isinstance(model, (TorchModel, nn.Module)):
            self.model = model
        else:
            self.model = self.build_model(**kwargs)
        # Detectron2-style config exposed by the model wrapper.
        self.model_cfg = self.model.get_model_cfg()
        if 'datasets_train' in kwargs:
            self.model_cfg.merge_from_list(
                ['DATASETS.TRAIN', kwargs['datasets_train']])
        if 'datasets_test' in kwargs:
            self.model_cfg.merge_from_list(
                ['DATASETS.TEST', kwargs['datasets_test']])
        if 'work_dir' in kwargs:
            self.model_cfg.merge_from_list(['OUTPUT_DIR', kwargs['work_dir']])
        if not os.path.exists(self.model_cfg.OUTPUT_DIR):
            os.makedirs(self.model_cfg.OUTPUT_DIR)
        self.model_cfg.freeze()
        self.data_dir = kwargs.get('data_dir', None)
        self.data_type = kwargs.get('data_type', 'pascal_voc')
        self.register_data(self.data_type, self.data_dir)
        self.trainer = DefaultTrainer(self.model, self.model_cfg)

    def train(self, *args, **kwargs):
        """Resume from the configured weights if present, then run training."""
        self.trainer.resume_or_load()
        self.trainer.train()

    def evaluate(self, checkpoint_path: str, *args, **kwargs):
        """Evaluate a checkpoint on the configured test datasets.

        Args:
            checkpoint_path: path of the checkpoint to evaluate.

        Returns:
            The metric values produced by :meth:`DefaultTrainer.test`.
        """
        from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer
        DetectionCheckpointer(
            self.model,
            save_dir=self.model_cfg.OUTPUT_DIR).resume_or_load(checkpoint_path)
        metric_values = DefaultTrainer.test(self.model_cfg, self.model)
        return metric_values

    def build_model(self, *args, **kwargs) -> Union[nn.Module, TorchModel]:
        """Build the model from the downloaded model directory.

        Returns the inner ``.model`` attribute when the hub wrapper exposes
        one, otherwise the wrapper itself when it is an ``nn.Module``.
        """
        model = Model.from_pretrained(self.model_dir, **kwargs)
        if not isinstance(model, nn.Module) and hasattr(model, 'model'):
            return model.model
        elif isinstance(model, nn.Module):
            return model

    @classmethod
    def register_data(cls, data_type='pascal_voc', data_dir=None):
        """Register the train/eval datasets in the detectron2 catalogs.

        Raises:
            NotImplementedError: for unsupported ``data_type`` values.
        """
        if data_type == 'pascal_voc':
            from modelscope.models.cv.image_defrcn_fewshot.utils.voc_register import register_all_voc
            if data_dir:
                register_all_voc(data_dir)
            else:
                register_all_voc()
        else:
            raise NotImplementedError(
                'no {} dataset was registered'.format(data_type))

    @classmethod
    def model_surgery(cls,
                      src_path,
                      save_dir,
                      data_type='pascal_voc',
                      method='remove'):
        """Prepare base-training weights for few-shot fine-tuning.

        Args:
            src_path: path of the source checkpoint.
            save_dir: directory the surgered checkpoint is written to.
            data_type: dataset type; only 'pascal_voc' is supported.
            method: 'remove' drops the classifier/regressor head weights;
                'randinit' re-initializes them for the enlarged class set.

        Raises:
            NotImplementedError: for unsupported ``data_type`` values.
        """
        assert method in ['remove',
                          'randinit'], '{} not implemented'.format(method)

        def _surgery(param_name, is_weight, tar_size, ckpt):
            # Grow one head parameter to `tar_size` rows, keeping the
            # pretrained rows and re-initializing the new ones.
            weight_name = param_name + ('.weight' if is_weight else '.bias')
            pretrained_weight = ckpt['model'][weight_name]
            prev_cls = pretrained_weight.size(0)
            if 'cls_score' in param_name:
                prev_cls -= 1  # last row is the background class
            if is_weight:
                feat_size = pretrained_weight.size(1)
                new_weight = torch.rand((tar_size, feat_size))
                torch.nn.init.normal_(new_weight, 0, 0.01)
            else:
                new_weight = torch.zeros(tar_size)
            new_weight[:prev_cls] = pretrained_weight[:prev_cls]
            if 'cls_score' in param_name:
                new_weight[-1] = pretrained_weight[-1]  # bg class
            ckpt['model'][weight_name] = new_weight

        if data_type == 'pascal_voc':
            TAR_SIZE = 20  # number of VOC classes
            params_name = [
                'model.roi_heads.box_predictor.cls_score',
                'model.roi_heads.box_predictor.bbox_pred'
            ]
            save_name = 'model_reset_' + ('remove' if method == 'remove' else
                                          'surgery') + '.pth'
            save_path = os.path.join(save_dir, save_name)
            os.makedirs(save_dir, exist_ok=True)
            ckpt = torch.load(src_path)
            # Drop training state; reset the iteration counter.
            if 'scheduler' in ckpt:
                del ckpt['scheduler']
            if 'optimizer' in ckpt:
                del ckpt['optimizer']
            if 'iteration' in ckpt:
                ckpt['iteration'] = 0
            if method == 'remove':
                for param_name in params_name:
                    del ckpt['model'][param_name + '.weight']
                    if param_name + '.bias' in ckpt['model']:
                        del ckpt['model'][param_name + '.bias']
            else:
                # +1 for cls_score's background class; 4 box deltas per class
                # for bbox_pred. (Dropped the unused enumerate index.)
                tar_sizes = [TAR_SIZE + 1, TAR_SIZE * 4]
                for param_name, tar_size in zip(params_name, tar_sizes):
                    _surgery(param_name, True, tar_size, ckpt)
                    _surgery(param_name, False, tar_size, ckpt)
            torch.save(ckpt, save_path)
        else:
            # Fix: the exception was previously constructed but never raised,
            # so unsupported dataset types silently did nothing.
            raise NotImplementedError(
                '{} dataset is not supported'.format(data_type))

View File

@@ -46,6 +46,7 @@ class CVTasks(object):
image_object_detection = 'image-object-detection'
video_object_detection = 'video-object-detection'
image_fewshot_detection = 'image-fewshot-detection'
image_segmentation = 'image-segmentation'
semantic_segmentation = 'semantic-segmentation'

View File

@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import subprocess
import sys
import unittest
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import test_level
logger = get_logger()
class ImageDefrcnFewShotTest(unittest.TestCase, DemoCompatibilityCheck):
    """Pipeline tests for the DeFRCN few-shot detection model."""

    def setUp(self) -> None:
        # The model requires a pinned detectron2 build; install it up front.
        logger.info('start install detectron2-0.3')
        pip_cmd = [
            sys.executable, '-m', 'pip', 'install', 'detectron2==0.3', '-f',
            'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html'
        ]
        subprocess.run(pip_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        logger.info('install detectron2-0.3 finished')
        self.task = Tasks.image_fewshot_detection
        self.model_id = 'damo/cv_resnet101_detection_fewshot-defrcn'
        self.image = 'data/test/images/image_voc2007_000001.jpg'

    def _detect_and_print(self, detector) -> None:
        # Run the detector on the fixture image and print predicted labels.
        print(detector(input=self.image)[OutputKeys.LABELS])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        self._detect_and_print(pipeline(task=self.task, model=model))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_model_name(self):
        self._detect_and_print(pipeline(task=self.task, model=self.model_id))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_with_default_model(self):
        self._detect_and_print(pipeline(task=self.task))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_by_direct_model_download(self):
        cache_path = snapshot_download(self.model_id)
        self._detect_and_print(pipeline(self.task, model=cache_path))

    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
    def test_demo_compatibility(self):
        self.compatibility_check()
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()

View File

@@ -49,6 +49,7 @@ isolated: # test cases that may require excessive anmount of GPU memory or run
- test_kws_nearfield_trainer.py
- test_gpt3_text_generation.py
- test_ddcolor_image_colorization.py
- test_image_defrcn_fewshot_trainer.py
- test_image_deblur_trainer.py
envs:

View File

@@ -0,0 +1,70 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import subprocess
import sys
import tempfile
import unittest
from modelscope.hub.utils.utils import get_cache_dir
from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.constant import DownloadMode
from modelscope.utils.test_utils import test_level
class TestImageDefrcnFewShotTrainer(unittest.TestCase):
    """Training test for the DeFRCN few-shot detection trainer."""

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        # The trainer requires a pinned detectron2 build; install it up front.
        cmd = [
            sys.executable, '-m', 'pip', 'install', 'detectron2==0.3', '-f',
            'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html'
        ]
        subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Fix: tempfile.TemporaryDirectory().name dropped the only reference
        # to the TemporaryDirectory object, whose finalizer may delete the
        # directory at an arbitrary GC point during the test. mkdtemp()
        # creates the directory with no finalizer; tearDown removes it.
        self.tmp_dir = tempfile.mkdtemp()
        self.model_id = 'damo/cv_resnet101_detection_fewshot-defrcn'
        # Download the few-shot VOC dataset and locate its 'data' directory.
        data_voc = MsDataset.load(
            dataset_name='VOC_fewshot',
            namespace='shimin2023',
            split='train',
            download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)
        self.data_dir = os.path.join(
            data_voc.config_kwargs['split_config']['train'], 'data')

    def tearDown(self):
        shutil.rmtree(self.tmp_dir)
        super().tearDown()

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_trainer(self):
        split = 1
        kwargs = dict(
            model=self.model_id,
            data_dir=self.data_dir,
            work_dir=self.tmp_dir,
            model_weights=os.path.join(get_cache_dir(), self.model_id,
                                       'ImageNetPretrained/MSRA/R-101.pkl'),
            data_type='pascal_voc',
            config_path='defrcn_det_r101_base{}.yaml'.format(split),
            datasets_train=('voc_2007_trainval_base{}'.format(split),
                            'voc_2012_trainval_base{}'.format(split)),
            datasets_test=('voc_2007_test_base{}'.format(split), ))
        trainer = build_trainer(
            name=Trainers.image_fewshot_detection, default_args=kwargs)
        trainer.train()
        # Training must leave the metrics log and the final checkpoint behind.
        results_files = os.listdir(self.tmp_dir)
        self.assertIn('metrics.json', results_files)
        self.assertIn('model_final.pth', results_files)
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()