From ff69439c4f48bbd1ca3e3f81a3c921925f8e3ca5 Mon Sep 17 00:00:00 2001 From: "ryan.yy" Date: Mon, 10 Oct 2022 17:42:41 +0800 Subject: [PATCH 01/57] [to #42322933]add image_body_reshaping code Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10217723 * add image_body_reshaping code --- data/test/images/image_body_reshaping.jpg | 3 + modelscope/metainfo.py | 2 + .../cv/image_body_reshaping/__init__.py | 20 + .../image_body_reshaping.py | 128 +++++ .../models/cv/image_body_reshaping/model.py | 189 +++++++ .../cv/image_body_reshaping/person_info.py | 339 ++++++++++++ .../pose_estimator/__init__.py | 0 .../pose_estimator/body.py | 272 ++++++++++ .../pose_estimator/model.py | 141 +++++ .../pose_estimator/util.py | 33 ++ .../cv/image_body_reshaping/slim_utils.py | 507 ++++++++++++++++++ modelscope/outputs.py | 1 + modelscope/pipelines/builder.py | 2 + .../cv/image_body_reshaping_pipeline.py | 40 ++ modelscope/utils/constant.py | 2 +- requirements/cv.txt | 1 + tests/pipelines/test_image_body_reshaping.py | 58 ++ 17 files changed, 1737 insertions(+), 1 deletion(-) create mode 100644 data/test/images/image_body_reshaping.jpg create mode 100644 modelscope/models/cv/image_body_reshaping/__init__.py create mode 100644 modelscope/models/cv/image_body_reshaping/image_body_reshaping.py create mode 100644 modelscope/models/cv/image_body_reshaping/model.py create mode 100644 modelscope/models/cv/image_body_reshaping/person_info.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/__init__.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/body.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/model.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/util.py create mode 100644 modelscope/models/cv/image_body_reshaping/slim_utils.py create mode 100644 modelscope/pipelines/cv/image_body_reshaping_pipeline.py create mode 100644 tests/pipelines/test_image_body_reshaping.py diff --git a/data/test/images/image_body_reshaping.jpg b/data/test/images/image_body_reshaping.jpg new file mode 100644 index 00000000..d78acb8f --- /dev/null +++ b/data/test/images/image_body_reshaping.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c1119e3d521cf2e583b1e85fc9c9afd1d44954b433135039a98050a730932d +size 1127557 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 28804ce6..1b8c4720 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -43,6 +43,7 @@ class Models(object): face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + image_body_reshaping = 'image-body-reshaping' # EasyCV models yolox = 'YOLOX' @@ -187,6 +188,7 @@ class Pipelines(object): face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + image_body_reshaping = 'flow-based-body-reshaping' # nlp tasks automatic_post_editing = 'automatic-post-editing' diff --git a/modelscope/models/cv/image_body_reshaping/__init__.py b/modelscope/models/cv/image_body_reshaping/__init__.py new file mode 100644 index 00000000..a04f110d --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .image_body_reshaping import ImageBodyReshaping + +else: + _import_structure = {'image_body_reshaping': ['ImageBodyReshaping']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_body_reshaping/image_body_reshaping.py b/modelscope/models/cv/image_body_reshaping/image_body_reshaping.py new file mode 100644 index 00000000..4aed8d98 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/image_body_reshaping.py @@ -0,0 +1,128 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import Any, Dict + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .model import FlowGenerator +from .person_info import PersonInfo +from .pose_estimator.body import Body +from .slim_utils import image_warp_grid1, resize_on_long_side + +logger = get_logger() + +__all__ = ['ImageBodyReshaping'] + + +@MODELS.register_module( + Tasks.image_body_reshaping, module_name=Models.image_body_reshaping) +class ImageBodyReshaping(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the image body reshaping model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + + if torch.cuda.is_available(): + self.device = torch.device('cuda') + else: + self.device = torch.device('cpu') + + self.degree = 1.0 + self.reshape_model = FlowGenerator(n_channels=16).to(self.device) + model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + checkpoints = torch.load(model_path, map_location=torch.device('cpu')) + self.reshape_model.load_state_dict( + checkpoints['state_dict'], strict=True) + self.reshape_model.eval() + logger.info('load body reshaping model done') + + pose_model_ckpt = os.path.join(model_dir, 'body_pose_model.pth') + self.pose_esti = Body(pose_model_ckpt, self.device) + logger.info('load pose model done') + + def pred_joints(self, img): + if img is None: + return None + small_src, resize_scale = resize_on_long_side(img, 300) + body_joints = self.pose_esti(small_src) + + if body_joints.shape[0] >= 1: + body_joints[:, :, :2] = body_joints[:, :, :2] / resize_scale + + return body_joints + + def pred_flow(self, img): + + body_joints = self.pred_joints(img) + small_size = 1200 + + if img.shape[0] > small_size or img.shape[1] > small_size: + _img, _scale = resize_on_long_side(img, small_size) + body_joints[:, :, :2] = body_joints[:, :, :2] * _scale + else: + _img = img + + # We only reshape one person + if body_joints.shape[0] < 1 or body_joints.shape[0] > 1: + return None + + person = PersonInfo(body_joints[0]) + + with torch.no_grad(): + person_pred = person.pred_flow(_img, self.reshape_model, + self.device) + + flow = np.dstack((person_pred['rDx'], person_pred['rDy'])) + + scale = img.shape[0] * 1.0 / flow.shape[0] + + flow = cv2.resize(flow, (img.shape[1], img.shape[0])) + flow *= scale + + return flow + + def warp(self, src_img, flow): + + X_flow = flow[..., 0] + Y_flow = flow[..., 1] + + X_flow = np.ascontiguousarray(X_flow) + Y_flow = np.ascontiguousarray(Y_flow) + + 
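        # image_warp_grid1 (the numba-jitted helper in slim_utils.py) performs a
        # per-pixel backward warp of src_img, sampling the source at
        # (x + dX, y + dY) with bilinear interpolation; transRatio=1.0 applies the
        # flow at full strength and the trailing zeros reserve no border margin.
        # The ascontiguousarray calls above hand C-contiguous copies of the flow
        # channels to the jitted kernel.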
pred = image_warp_grid1(X_flow, Y_flow, src_img, 1.0, 0, 0) + return pred + + def inference(self, img): + img = img.cpu().numpy() + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + flow = self.pred_flow(img) + + if flow is None: + return img + + assert flow.shape[:2] == img.shape[:2] + + mag, ang = cv2.cartToPolar(flow[..., 0] + 1e-8, flow[..., 1] + 1e-8) + mag -= 3 + mag[mag <= 0] = 0 + + x, y = cv2.polarToCart(mag, ang, angleInDegrees=False) + flow = np.dstack((x, y)) + + flow *= self.degree + pred = self.warp(img, flow) + out_img = np.clip(pred, 0, 255) + logger.info('model inference done') + + return out_img.astype(np.uint8) diff --git a/modelscope/models/cv/image_body_reshaping/model.py b/modelscope/models/cv/image_body_reshaping/model.py new file mode 100644 index 00000000..174428a1 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/model.py @@ -0,0 +1,189 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ConvLayer(nn.Module): + + def __init__(self, in_ch, out_ch): + super(ConvLayer, self).__init__() + + self.conv = nn.Sequential( + nn.ReflectionPad2d(1), + nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=0), + nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True)) + + def forward(self, x): + x = self.conv(x) + return x + + +class SASA(nn.Module): + + def __init__(self, in_dim): + super(SASA, self).__init__() + self.chanel_in = in_dim + + self.query_conv = nn.Conv2d( + in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) + self.key_conv = nn.Conv2d( + in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) + self.value_conv = nn.Conv2d( + in_channels=in_dim, out_channels=in_dim, kernel_size=1) + self.mag_conv = nn.Conv2d( + in_channels=5, out_channels=in_dim // 32, kernel_size=1) + + self.gamma = nn.Parameter(torch.zeros(1)) + + self.softmax = nn.Softmax(dim=-1) # + self.sigmoid = nn.Sigmoid() + + def structure_encoder(self, paf_mag, target_height, target_width): + torso_mask = torch.sum(paf_mag[:, 1:3, :, :], dim=1, keepdim=True) + torso_mask = torch.clamp(torso_mask, 0, 1) + + arms_mask = torch.sum(paf_mag[:, 4:8, :, :], dim=1, keepdim=True) + arms_mask = torch.clamp(arms_mask, 0, 1) + + legs_mask = torch.sum(paf_mag[:, 8:12, :, :], dim=1, keepdim=True) + legs_mask = torch.clamp(legs_mask, 0, 1) + + fg_mask = paf_mag[:, 12, :, :].unsqueeze(1) + bg_mask = 1 - fg_mask + Y = torch.cat((arms_mask, torso_mask, legs_mask, fg_mask, bg_mask), + dim=1) + Y = F.interpolate(Y, size=(target_height, target_width), mode='area') + return Y + + def forward(self, X, PAF_mag): + """extract self-attention features. 
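        The attention is gated by body-structure connectivity: structure_encoder
        collapses the PAF magnitude channels into arms / torso / legs / foreground /
        background masks, a pairwise affinity between spatial positions is computed
        from those masks, and after centring and a sigmoid it is multiplied
        element-wise into the query-key attention map before the values are
        aggregated.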
+ Args: + X : input feature maps( B x C x H x W) + PAF_mag : ( B x C x H x W), 1 denotes connectivity, 0 denotes non-connectivity + + Returns: + out : self attention value + input feature + Y: B X N X N (N is Width*Height) + """ + + m_batchsize, C, height, width = X.size() + + Y = self.structure_encoder(PAF_mag, height, width) + + connectivity_mask_vec = self.mag_conv(Y).view(m_batchsize, -1, + width * height) + affinity = torch.bmm( + connectivity_mask_vec.permute(0, 2, 1), connectivity_mask_vec) + affinity_centered = affinity - torch.mean(affinity) + affinity_sigmoid = self.sigmoid(affinity_centered) + + proj_query = self.query_conv(X).view(m_batchsize, -1, + width * height).permute(0, 2, 1) + proj_key = self.key_conv(X).view(m_batchsize, -1, width * height) + selfatten_map = torch.bmm(proj_query, proj_key) + selfatten_centered = selfatten_map - torch.mean( + selfatten_map) # centering + selfatten_sigmoid = self.sigmoid(selfatten_centered) + + SASA_map = selfatten_sigmoid * affinity_sigmoid + + proj_value = self.value_conv(X).view(m_batchsize, -1, width * height) + + out = torch.bmm(proj_value, SASA_map.permute(0, 2, 1)) + out = out.view(m_batchsize, C, height, width) + + out = self.gamma * out + X + return out, Y + + +class FlowGenerator(nn.Module): + + def __init__(self, n_channels, deep_supervision=False): + super(FlowGenerator, self).__init__() + self.deep_supervision = deep_supervision + + self.Encoder = nn.Sequential( + ConvLayer(n_channels, 64), + ConvLayer(64, 64), + nn.MaxPool2d(2), + ConvLayer(64, 128), + ConvLayer(128, 128), + nn.MaxPool2d(2), + ConvLayer(128, 256), + ConvLayer(256, 256), + nn.MaxPool2d(2), + ConvLayer(256, 512), + ConvLayer(512, 512), + nn.MaxPool2d(2), + ConvLayer(512, 1024), + ConvLayer(1024, 1024), + ConvLayer(1024, 1024), + ConvLayer(1024, 1024), + ConvLayer(1024, 1024), + ) + + self.SASA = SASA(in_dim=1024) + + self.Decoder = nn.Sequential( + ConvLayer(1024, 1024), + nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), + ConvLayer(1024, 512), + ConvLayer(512, 512), + nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), + ConvLayer(512, 256), + ConvLayer(256, 256), + ConvLayer(256, 128), + ConvLayer(128, 64), + ConvLayer(64, 32), + nn.Conv2d(32, 2, kernel_size=1, padding=0), + nn.Tanh(), + nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True), + ) + + dilation_ksize = 17 + self.dilation = torch.nn.MaxPool2d( + kernel_size=dilation_ksize, + stride=1, + padding=int((dilation_ksize - 1) / 2)) + + def warp(self, x, flow, mode='bilinear', padding_mode='zeros', coff=0.2): + n, c, h, w = x.size() + yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)]) + xv = xv.float() / (w - 1) * 2.0 - 1 + yv = yv.float() / (h - 1) * 2.0 - 1 + grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), -1).unsqueeze(0) + grid = grid.to(flow.device) + grid_x = grid + 2 * flow * coff + warp_x = F.grid_sample(x, grid_x, mode=mode, padding_mode=padding_mode) + return warp_x + + def forward(self, img, skeleton_map, coef=0.2): + """extract self-attention features. 
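        The 3-channel image and the 13-channel skeleton map (12 limb channels plus
        one whole-body channel) are concatenated into the 16-channel encoder input.
        PAF magnitude maps are obtained from the skeleton map by max-pool dilation
        and feed the SASA block at the bottleneck; the decoder then regresses a
        2-channel flow field that is applied to the image with grid_sample in warp().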
+ Args: + img : input numpy image + skeleton_map : skeleton map of input image + coef: warp degree + + Returns: + warp_x : warped image + flow: predicted flow + """ + + img_concat = torch.cat((img, skeleton_map), dim=1) + X = self.Encoder(img_concat) + + _, _, height, width = X.size() + + # directly get PAF magnitude from skeleton maps via dilation + PAF_mag = self.dilation((skeleton_map + 1.0) * 0.5) + + out, Y = self.SASA(X, PAF_mag) + flow = self.Decoder(out) + + flow = flow.permute(0, 2, 3, 1) # [n, 2, h, w] ==> [n, h, w, 2] + + warp_x = self.warp(img, flow, coff=coef) + warp_x = torch.clamp(warp_x, min=-1.0, max=1.0) + + return warp_x, flow diff --git a/modelscope/models/cv/image_body_reshaping/person_info.py b/modelscope/models/cv/image_body_reshaping/person_info.py new file mode 100644 index 00000000..509a2ce3 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/person_info.py @@ -0,0 +1,339 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import copy + +import cv2 +import numpy as np +import torch + +from .slim_utils import (enlarge_box_tblr, gen_skeleton_map, + get_map_fusion_map_cuda, get_mask_bbox, + resize_on_long_side) + + +class PersonInfo(object): + + def __init__(self, joints): + self.joints = joints + self.flow = None + self.pad_boder = False + self.height_expand = 0 + self.width_expand = 0 + self.coeff = 0.2 + self.network_input_W = 256 + self.network_input_H = 256 + self.divider = 20 + self.flow_scales = ['upper_2'] + + def update_attribute(self, pad_boder, height_expand, width_expand): + self.pad_boder = pad_boder + self.height_expand = height_expand + self.width_expand = width_expand + if pad_boder: + self.joints[:, 0] += width_expand + self.joints[:, 1] += height_expand + + def pred_flow(self, img, flow_net, device): + with torch.no_grad(): + if img is None: + print('image is none') + self.flow = None + + if len(img.shape) == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + + if self.pad_boder: + height_expand = self.height_expand + width_expand = self.width_expand + pad_img = cv2.copyMakeBorder( + img, + height_expand, + height_expand, + width_expand, + width_expand, + cv2.BORDER_CONSTANT, + value=(127, 127, 127)) + + else: + height_expand = 0 + width_expand = 0 + pad_img = img.copy() + + canvas = np.zeros( + shape=(pad_img.shape[0], pad_img.shape[1]), dtype=np.float32) + + self.human_joint_box = self.__joint_to_body_box() + + self.human_box = enlarge_box_tblr( + self.human_joint_box, pad_img, ratio=0.25) + human_box_height = self.human_box[1] - self.human_box[0] + human_box_width = self.human_box[3] - self.human_box[2] + + self.leg_joint_box = self.__joint_to_leg_box() + self.leg_box = enlarge_box_tblr( + self.leg_joint_box, pad_img, ratio=0.25) + + self.arm_joint_box = self.__joint_to_arm_box() + self.arm_box = enlarge_box_tblr( + self.arm_joint_box, pad_img, ratio=0.1) + + x_flows = [] + y_flows = [] + multi_bbox = [] + + for scale in self.flow_scales: # better for metric + scale_value = float(scale.split('_')[-1]) + + arm_box = copy.deepcopy(self.arm_box) + + if arm_box[0] is None: + arm_box = self.human_box + + arm_box_height = arm_box[1] - arm_box[0] + arm_box_width = arm_box[3] - arm_box[2] + + roi_bbox = None + + if arm_box_width < human_box_width * 0.1 or arm_box_height < human_box_height * 0.1: + roi_bbox = self.human_box + else: + arm_box = enlarge_box_tblr( + arm_box, pad_img, ratio=scale_value) + if scale == 'upper_0.2': + arm_box[0] = min(arm_box[0], int(self.joints[0][1])) + if scale.startswith('upper'): + roi_bbox = [ + 
max(self.human_box[0], arm_box[0]), + min(self.human_box[1], arm_box[1]), + max(self.human_box[2], arm_box[2]), + min(self.human_box[3], arm_box[3]) + ] + if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ + 3] - roi_bbox[2] < 1: + continue + + elif scale.startswith('lower'): + roi_bbox = [ + max(self.human_box[0], self.leg_box[0]), + min(self.human_box[1], self.leg_box[1]), + max(self.human_box[2], self.leg_box[2]), + min(self.human_box[3], self.leg_box[3]) + ] + + if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ + 3] - roi_bbox[2] < 1: + continue + + skel_map, roi_bbox = gen_skeleton_map( + self.joints, 'depth', input_roi_box=roi_bbox) + + if roi_bbox is None: + continue + + if skel_map.dtype != np.float32: + skel_map = skel_map.astype(np.float32) + + skel_map -= 1.0 # [0,2] ->[-1,1] + + multi_bbox.append(roi_bbox) + + roi_bbox_height = roi_bbox[1] - roi_bbox[0] + roi_bbox_width = roi_bbox[3] - roi_bbox[2] + + assert skel_map.shape[0] == roi_bbox_height + assert skel_map.shape[1] == roi_bbox_width + roi_height_pad = roi_bbox_height // self.divider + roi_width_pad = roi_bbox_width // self.divider + paded_roi_h = roi_bbox_height + 2 * roi_height_pad + paded_roi_w = roi_bbox_width + 2 * roi_width_pad + + roi_height_pad_joint = skel_map.shape[0] // self.divider + roi_width_pad_joint = skel_map.shape[1] // self.divider + skel_map = np.pad( + skel_map, + ((roi_height_pad_joint, roi_height_pad_joint), + (roi_width_pad_joint, roi_width_pad_joint), (0, 0)), + 'constant', + constant_values=-1) + + skel_map_resized = cv2.resize( + skel_map, (self.network_input_W, self.network_input_H)) + + skel_map_resized[skel_map_resized < 0] = -1.0 + skel_map_resized[skel_map_resized > -0.5] = 1.0 + skel_map_transformed = torch.from_numpy( + skel_map_resized.transpose((2, 0, 1))) + + roi_npy = pad_img[roi_bbox[0]:roi_bbox[1], + roi_bbox[2]:roi_bbox[3], :].copy() + if roi_npy.dtype != np.float32: + roi_npy = roi_npy.astype(np.float32) + + roi_npy = np.pad(roi_npy, + ((roi_height_pad, roi_height_pad), + (roi_width_pad, roi_width_pad), (0, 0)), + 'edge') + + roi_npy = roi_npy[:, :, ::-1] + + roi_npy = cv2.resize( + roi_npy, (self.network_input_W, self.network_input_H)) + + roi_npy *= 1.0 / 255 + roi_npy -= 0.5 + roi_npy *= 2 + + rgb_tensor = torch.from_numpy(roi_npy.transpose((2, 0, 1))) + + rgb_tensor = rgb_tensor.unsqueeze(0).to(device) + skel_map_tensor = skel_map_transformed.unsqueeze(0).to(device) + warped_img_val, flow_field_val = flow_net( + rgb_tensor, skel_map_tensor + ) # inference, connectivity_mask [1,12,16,16] + flow_field_val = flow_field_val.detach().squeeze().cpu().numpy( + ) + + flow_field_val = cv2.resize( + flow_field_val, (paded_roi_w, paded_roi_h), + interpolation=cv2.INTER_LINEAR) + flow_field_val[..., 0] = flow_field_val[ + ..., 0] * paded_roi_w * 0.5 * 2 * self.coeff + flow_field_val[..., 1] = flow_field_val[ + ..., 1] * paded_roi_h * 0.5 * 2 * self.coeff + + # remove pad areas + flow_field_val = flow_field_val[ + roi_height_pad:flow_field_val.shape[0] - roi_height_pad, + roi_width_pad:flow_field_val.shape[1] - roi_width_pad, :] + + diffuse_width = max(roi_bbox_width // 3, 1) + diffuse_height = max(roi_bbox_height // 3, 1) + assert roi_bbox_width == flow_field_val.shape[1] + assert roi_bbox_height == flow_field_val.shape[0] + + origin_flow = np.zeros( + (pad_img.shape[0] + 2 * diffuse_height, + pad_img.shape[1] + 2 * diffuse_width, 2), + dtype=np.float32) + + flow_field_val = np.pad(flow_field_val, + ((diffuse_height, diffuse_height), + (diffuse_width, diffuse_width), + (0, 0)), 'linear_ramp') + + 
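                # Paste the locally predicted flow back into a full-image canvas.
                # The 'linear_ramp' padding above lets the flow fade to zero over
                # the diffuse margin so no seam appears at the ROI border; the
                # temporary margin is cropped off again just below.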
origin_flow[roi_bbox[0]:roi_bbox[1] + 2 * diffuse_height, + roi_bbox[2]:roi_bbox[3] + + 2 * diffuse_width] = flow_field_val + + origin_flow = origin_flow[diffuse_height:-diffuse_height, + diffuse_width:-diffuse_width, :] + + x_flows.append(origin_flow[..., 0]) + y_flows.append(origin_flow[..., 1]) + + if len(x_flows) == 0: + return { + 'rDx': np.zeros(canvas.shape[:2], dtype=np.float32), + 'rDy': np.zeros(canvas.shape[:2], dtype=np.float32), + 'multi_bbox': multi_bbox, + 'x_fusion_map': + np.ones(canvas.shape[:2], dtype=np.float32), + 'y_fusion_map': + np.ones(canvas.shape[:2], dtype=np.float32) + } + else: + origin_rDx, origin_rDy, x_fusion_map, y_fusion_map = self.blend_multiscale_flow( + x_flows, y_flows, device=device) + + return { + 'rDx': origin_rDx, + 'rDy': origin_rDy, + 'multi_bbox': multi_bbox, + 'x_fusion_map': x_fusion_map, + 'y_fusion_map': y_fusion_map + } + + @staticmethod + def blend_multiscale_flow(x_flows, y_flows, device=None): + scale_num = len(x_flows) + if scale_num == 1: + return x_flows[0], y_flows[0], np.ones_like( + x_flows[0]), np.ones_like(x_flows[0]) + + origin_rDx = np.zeros((x_flows[0].shape[0], x_flows[0].shape[1]), + dtype=np.float32) + origin_rDy = np.zeros((y_flows[0].shape[0], y_flows[0].shape[1]), + dtype=np.float32) + + x_fusion_map, x_acc_map = get_map_fusion_map_cuda( + x_flows, 1, device=device) + y_fusion_map, y_acc_map = get_map_fusion_map_cuda( + y_flows, 1, device=device) + + x_flow_map = 1.0 / x_fusion_map + y_flow_map = 1.0 / y_fusion_map + + all_acc_map = x_acc_map + y_acc_map + all_acc_map = all_acc_map.astype(np.uint8) + roi_box = get_mask_bbox(all_acc_map, threshold=1) + + if roi_box[0] is None or roi_box[1] - roi_box[0] <= 0 or roi_box[ + 3] - roi_box[2] <= 0: + roi_box = [0, x_flow_map.shape[0], 0, x_flow_map.shape[1]] + + roi_x_flow_map = x_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] + roi_y_flow_map = y_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] + + roi_width = roi_x_flow_map.shape[1] + roi_height = roi_x_flow_map.shape[0] + + roi_x_flow_map, scale = resize_on_long_side(roi_x_flow_map, 320) + roi_y_flow_map, scale = resize_on_long_side(roi_y_flow_map, 320) + + roi_x_flow_map = cv2.blur(roi_x_flow_map, (55, 55)) + roi_y_flow_map = cv2.blur(roi_y_flow_map, (55, 55)) + + roi_x_flow_map = cv2.resize(roi_x_flow_map, (roi_width, roi_height)) + roi_y_flow_map = cv2.resize(roi_y_flow_map, (roi_width, roi_height)) + + x_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] = roi_x_flow_map + y_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] = roi_y_flow_map + + for i in range(scale_num): + origin_rDx += x_flows[i] + origin_rDy += y_flows[i] + + origin_rDx *= x_flow_map + origin_rDy *= y_flow_map + + return origin_rDx, origin_rDy, x_flow_map, y_flow_map + + def __joint_to_body_box(self): + joint_left = int(np.min(self.joints, axis=0)[0]) + joint_right = int(np.max(self.joints, axis=0)[0]) + joint_top = int(np.min(self.joints, axis=0)[1]) + joint_bottom = int(np.max(self.joints, axis=0)[1]) + return [joint_top, joint_bottom, joint_left, joint_right] + + def __joint_to_leg_box(self): + leg_joints = self.joints[8:, :] + if np.max(leg_joints, axis=0)[2] < 0.05: + return [0, 0, 0, 0] + joint_left = int(np.min(leg_joints, axis=0)[0]) + joint_right = int(np.max(leg_joints, axis=0)[0]) + joint_top = int(np.min(leg_joints, axis=0)[1]) + joint_bottom = int(np.max(leg_joints, axis=0)[1]) + return [joint_top, joint_bottom, joint_left, joint_right] + + def __joint_to_arm_box(self): + arm_joints = self.joints[2:8, :] + 
if np.max(arm_joints, axis=0)[2] < 0.05: + return [0, 0, 0, 0] + joint_left = int(np.min(arm_joints, axis=0)[0]) + joint_right = int(np.max(arm_joints, axis=0)[0]) + joint_top = int(np.min(arm_joints, axis=0)[1]) + joint_bottom = int(np.max(arm_joints, axis=0)[1]) + return [joint_top, joint_bottom, joint_left, joint_right] diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/__init__.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/body.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/body.py new file mode 100644 index 00000000..45b02724 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/pose_estimator/body.py @@ -0,0 +1,272 @@ +# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. + +import math + +import cv2 +import numpy as np +import torch +from scipy.ndimage.filters import gaussian_filter + +from .model import BodyposeModel +from .util import pad_rightdown_corner, transfer + + +class Body(object): + + def __init__(self, model_path, device): + self.model = BodyposeModel().to(device) + model_dict = transfer(self.model, torch.load(model_path)) + self.model.load_state_dict(model_dict) + self.model.eval() + + def __call__(self, oriImg): + scale_search = [0.5] + boxsize = 368 + stride = 8 + padValue = 128 + thre1 = 0.1 + thre2 = 0.05 + bodyparts = 18 + multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] + heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19)) + paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) + + for m in range(len(multiplier)): + scale = multiplier[m] + imageToTest = cv2.resize( + oriImg, (0, 0), + fx=scale, + fy=scale, + interpolation=cv2.INTER_CUBIC) + imageToTest_padded, pad = pad_rightdown_corner( + imageToTest, stride, padValue) + im = np.transpose( + np.float32(imageToTest_padded[:, :, :, np.newaxis]), + (3, 2, 0, 1)) / 256 - 0.5 + im = np.ascontiguousarray(im) + + data = torch.from_numpy(im).float() + if torch.cuda.is_available(): + data = data.cuda() + with torch.no_grad(): + Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data) + Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy() + Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy() + + # extract outputs, resize, and remove padding + heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), + (1, 2, 0)) # output 1 is heatmaps + heatmap = cv2.resize( + heatmap, (0, 0), + fx=stride, + fy=stride, + interpolation=cv2.INTER_CUBIC) + heatmap = heatmap[:imageToTest_padded.shape[0] + - pad[2], :imageToTest_padded.shape[1] + - pad[3], :] + heatmap = cv2.resize( + heatmap, (oriImg.shape[1], oriImg.shape[0]), + interpolation=cv2.INTER_CUBIC) + + paf = np.transpose(np.squeeze(Mconv7_stage6_L1), + (1, 2, 0)) # output 0 is PAFs + paf = cv2.resize( + paf, (0, 0), + fx=stride, + fy=stride, + interpolation=cv2.INTER_CUBIC) + paf = paf[:imageToTest_padded.shape[0] + - pad[2], :imageToTest_padded.shape[1] - pad[3], :] + paf = cv2.resize( + paf, (oriImg.shape[1], oriImg.shape[0]), + interpolation=cv2.INTER_CUBIC) + + heatmap_avg += heatmap_avg + heatmap / len(multiplier) + paf_avg += +paf / len(multiplier) + + all_peaks = [] + peak_counter = 0 + + for part in range(bodyparts): + map_ori = heatmap_avg[:, :, part] + one_heatmap = gaussian_filter(map_ori, sigma=3) + + map_left = np.zeros(one_heatmap.shape) + map_left[1:, :] = one_heatmap[:-1, :] + map_right = np.zeros(one_heatmap.shape) + 
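            # The four shifted copies (left / right / up / down) implement a simple
            # non-maximum suppression: a pixel of the blurred heatmap becomes a
            # keypoint candidate only if it is >= all four neighbours and above thre1.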
map_right[:-1, :] = one_heatmap[1:, :] + map_up = np.zeros(one_heatmap.shape) + map_up[:, 1:] = one_heatmap[:, :-1] + map_down = np.zeros(one_heatmap.shape) + map_down[:, :-1] = one_heatmap[:, 1:] + + peaks_binary = np.logical_and.reduce( + (one_heatmap >= map_left, one_heatmap >= map_right, + one_heatmap >= map_up, one_heatmap >= map_down, + one_heatmap > thre1)) + peaks = list( + zip(np.nonzero(peaks_binary)[1], + np.nonzero(peaks_binary)[0])) # note reverse + peaks_with_score = [x + (map_ori[x[1], x[0]], ) for x in peaks] + peak_id = range(peak_counter, peak_counter + len(peaks)) + peaks_with_score_and_id = [ + peaks_with_score[i] + (peak_id[i], ) + for i in range(len(peak_id)) + ] + + all_peaks.append(peaks_with_score_and_id) + peak_counter += len(peaks) + + # find connection in the specified sequence, center 29 is in the position 15 + limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], + [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], + [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]] + # the middle joints heatmap correpondence + mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], + [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30], + [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38], + [45, 46]] + + connection_all = [] + special_k = [] + mid_num = 10 + + for k in range(len(mapIdx)): + score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]] + candA = all_peaks[limbSeq[k][0] - 1] + candB = all_peaks[limbSeq[k][1] - 1] + nA = len(candA) + nB = len(candB) + if (nA != 0 and nB != 0): + connection_candidate = [] + for i in range(nA): + for j in range(nB): + vec = np.subtract(candB[j][:2], candA[i][:2]) + norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1]) + norm = max(0.001, norm) + vec = np.divide(vec, norm) + + startend = list( + zip( + np.linspace( + candA[i][0], candB[j][0], num=mid_num), + np.linspace( + candA[i][1], candB[j][1], num=mid_num))) + + vec_x = np.array([ + score_mid[int(round(startend[item][1])), + int(round(startend[item][0])), 0] + for item in range(len(startend)) + ]) + vec_y = np.array([ + score_mid[int(round(startend[item][1])), + int(round(startend[item][0])), 1] + for item in range(len(startend)) + ]) + + score_midpts = np.multiply( + vec_x, vec[0]) + np.multiply(vec_y, vec[1]) + temp1 = sum(score_midpts) / len(score_midpts) + temp2 = min(0.5 * oriImg.shape[0] / norm - 1, 0) + score_with_dist_prior = temp1 + temp2 + criterion1 = len(np.nonzero( + score_midpts > thre2)[0]) > 0.8 * len(score_midpts) + criterion2 = score_with_dist_prior > 0 + if criterion1 and criterion2: + connection_candidate.append([ + i, j, score_with_dist_prior, + score_with_dist_prior + candA[i][2] + + candB[j][2] + ]) + + connection_candidate = sorted( + connection_candidate, key=lambda x: x[2], reverse=True) + connection = np.zeros((0, 5)) + for c in range(len(connection_candidate)): + i, j, s = connection_candidate[c][0:3] + if (i not in connection[:, 3] + and j not in connection[:, 4]): + connection = np.vstack( + [connection, [candA[i][3], candB[j][3], s, i, j]]) + if (len(connection) >= min(nA, nB)): + break + + connection_all.append(connection) + else: + special_k.append(k) + connection_all.append([]) + + # last number in each row is the total parts number of that person + # the second last number in each row is the score of the overall configuration + subset = -1 * np.ones((0, 20)) + candidate = np.array( + [item for sublist in all_peaks for item in sublist]) + + for k in range(len(mapIdx)): + if k not in special_k: + 
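                # Assemble limbs into people: each row of `subset` is one person
                # (columns 0-17 index into `candidate`, column 18 holds the summed
                # score, column 19 the number of parts found). A connection that
                # shares a joint with an existing row extends that row, a connection
                # bridging two disjoint rows merges them, and sparse or low-score
                # rows are deleted afterwards.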
partAs = connection_all[k][:, 0] + partBs = connection_all[k][:, 1] + indexA, indexB = np.array(limbSeq[k]) - 1 + + for i in range(len(connection_all[k])): # = 1:size(temp,1) + found = 0 + subset_idx = [-1, -1] + for j in range(len(subset)): # 1:size(subset,1): + if subset[j][indexA] == partAs[i] or subset[j][ + indexB] == partBs[i]: + subset_idx[found] = j + found += 1 + + if found == 1: + j = subset_idx[0] + if subset[j][indexB] != partBs[i]: + subset[j][indexB] = partBs[i] + subset[j][-1] += 1 + subset[j][-2] += candidate[ + partBs[i].astype(int), + 2] + connection_all[k][i][2] + elif found == 2: # if found 2 and disjoint, merge them + j1, j2 = subset_idx + tmp1 = (subset[j1] >= 0).astype(int) + tmp2 = (subset[j2] >= 0).astype(int) + membership = (tmp1 + tmp2)[:-2] + if len(np.nonzero(membership == 2)[0]) == 0: # merge + subset[j1][:-2] += (subset[j2][:-2] + 1) + subset[j1][-2:] += subset[j2][-2:] + subset[j1][-2] += connection_all[k][i][2] + subset = np.delete(subset, j2, 0) + else: # as like found == 1 + subset[j1][indexB] = partBs[i] + subset[j1][-1] += 1 + subset[j1][-2] += candidate[ + partBs[i].astype(int), + 2] + connection_all[k][i][2] + + # if find no partA in the subset, create a new subset + elif not found and k < 17: + row = -1 * np.ones(20) + row[indexA] = partAs[i] + row[indexB] = partBs[i] + row[-1] = 2 + row[-2] = sum( + candidate[connection_all[k][i, :2].astype(int), + 2]) + connection_all[k][i][2] + subset = np.vstack([subset, row]) + # delete some rows of subset which has few parts occur + deleteIdx = [] + for i in range(len(subset)): + if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4: + deleteIdx.append(i) + subset = np.delete(subset, deleteIdx, axis=0) + + # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts + # candidate: x, y, score, id + count = subset.shape[0] + joints = np.zeros(shape=(count, bodyparts, 3)) + + for i in range(count): + for j in range(bodyparts): + joints[i, j, :3] = candidate[int(subset[i, j]), :3] + confidence = 1.0 if subset[i, j] >= 0 else 0.0 + joints[i, j, 2] *= confidence + return joints diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/model.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/model.py new file mode 100644 index 00000000..12f6e84d --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/pose_estimator/model.py @@ -0,0 +1,141 @@ +# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. 
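For orientation, BodyposeModel below is the standard six-stage OpenPose body network: branch L1 regresses 38 part-affinity-field channels and branch L2 regresses 19 keypoint heatmaps, both at 1/8 of the input resolution. A quick shape check, assuming the module path added by this patch (an illustrative sketch, not part of the patch):

    import torch
    from modelscope.models.cv.image_body_reshaping.pose_estimator.model import BodyposeModel

    model = BodyposeModel().eval()
    with torch.no_grad():
        # random weights are fine for checking output shapes
        paf, heatmap = model(torch.zeros(1, 3, 368, 368))
    print(paf.shape)      # torch.Size([1, 38, 46, 46])  -> 19 limbs x (dx, dy)
    print(heatmap.shape)  # torch.Size([1, 19, 46, 46])  -> 18 keypoints + background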
+ +from collections import OrderedDict + +import torch +import torch.nn as nn + + +def make_layers(block, no_relu_layers): + layers = [] + for layer_name, v in block.items(): + if 'pool' in layer_name: + layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2]) + layers.append((layer_name, layer)) + else: + conv2d = nn.Conv2d( + in_channels=v[0], + out_channels=v[1], + kernel_size=v[2], + stride=v[3], + padding=v[4]) + layers.append((layer_name, conv2d)) + if layer_name not in no_relu_layers: + layers.append(('relu_' + layer_name, nn.ReLU(inplace=True))) + + return nn.Sequential(OrderedDict(layers)) + + +class BodyposeModel(nn.Module): + + def __init__(self): + super(BodyposeModel, self).__init__() + + # these layers have no relu layer + no_relu_layers = [ + 'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1', + 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2', + 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1', + 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1' + ] + blocks = {} + block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]), + ('conv1_2', [64, 64, 3, 1, 1]), + ('pool1_stage1', [2, 2, 0]), + ('conv2_1', [64, 128, 3, 1, 1]), + ('conv2_2', [128, 128, 3, 1, 1]), + ('pool2_stage1', [2, 2, 0]), + ('conv3_1', [128, 256, 3, 1, 1]), + ('conv3_2', [256, 256, 3, 1, 1]), + ('conv3_3', [256, 256, 3, 1, 1]), + ('conv3_4', [256, 256, 3, 1, 1]), + ('pool3_stage1', [2, 2, 0]), + ('conv4_1', [256, 512, 3, 1, 1]), + ('conv4_2', [512, 512, 3, 1, 1]), + ('conv4_3_CPM', [512, 256, 3, 1, 1]), + ('conv4_4_CPM', [256, 128, 3, 1, 1])]) + + # Stage 1 + block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]), + ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])]) + + block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]), + ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])]) + blocks['block1_1'] = block1_1 + blocks['block1_2'] = block1_2 + + self.model0 = make_layers(block0, no_relu_layers) + + # Stages 2 - 6 + for i in range(2, 7): + blocks['block%d_1' % i] = OrderedDict([ + ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]), + ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]), + ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0]) + ]) + + blocks['block%d_2' % i] = OrderedDict([ + ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]), + ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]), + ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0]) + ]) + + for k in blocks.keys(): + blocks[k] = make_layers(blocks[k], no_relu_layers) + + self.model1_1 = blocks['block1_1'] + self.model2_1 = blocks['block2_1'] + self.model3_1 = blocks['block3_1'] + self.model4_1 = blocks['block4_1'] + self.model5_1 = blocks['block5_1'] + self.model6_1 = blocks['block6_1'] + + self.model1_2 = blocks['block1_2'] + self.model2_2 = blocks['block2_2'] + self.model3_2 = blocks['block3_2'] + self.model4_2 = blocks['block4_2'] + self.model5_2 = blocks['block5_2'] + self.model6_2 = 
blocks['block6_2'] + + def forward(self, x): + + out1 = self.model0(x) + + out1_1 = self.model1_1(out1) + out1_2 = self.model1_2(out1) + out2 = torch.cat([out1_1, out1_2, out1], 1) + + out2_1 = self.model2_1(out2) + out2_2 = self.model2_2(out2) + out3 = torch.cat([out2_1, out2_2, out1], 1) + + out3_1 = self.model3_1(out3) + out3_2 = self.model3_2(out3) + out4 = torch.cat([out3_1, out3_2, out1], 1) + + out4_1 = self.model4_1(out4) + out4_2 = self.model4_2(out4) + out5 = torch.cat([out4_1, out4_2, out1], 1) + + out5_1 = self.model5_1(out5) + out5_2 = self.model5_2(out5) + out6 = torch.cat([out5_1, out5_2, out1], 1) + + out6_1 = self.model6_1(out6) + out6_2 = self.model6_2(out6) + + return out6_1, out6_2 diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/util.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/util.py new file mode 100644 index 00000000..13a42074 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/pose_estimator/util.py @@ -0,0 +1,33 @@ +# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. +import numpy as np + + +def pad_rightdown_corner(img, stride, padValue): + h = img.shape[0] + w = img.shape[1] + + pad = 4 * [None] + pad[0] = 0 # up + pad[1] = 0 # left + pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down + pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right + + img_padded = img + pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1)) + img_padded = np.concatenate((pad_up, img_padded), axis=0) + pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1)) + img_padded = np.concatenate((pad_left, img_padded), axis=1) + pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1)) + img_padded = np.concatenate((img_padded, pad_down), axis=0) + pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1)) + img_padded = np.concatenate((img_padded, pad_right), axis=1) + + return img_padded, pad + + +def transfer(model, model_weights): + transfered_model_weights = {} + for weights_name in model.state_dict().keys(): + transfered_model_weights[weights_name] = model_weights['.'.join( + weights_name.split('.')[1:])] + return transfered_model_weights diff --git a/modelscope/models/cv/image_body_reshaping/slim_utils.py b/modelscope/models/cv/image_body_reshaping/slim_utils.py new file mode 100644 index 00000000..23d5a741 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/slim_utils.py @@ -0,0 +1,507 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
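Several call sites in this patch (pose estimation at 300 px, flow prediction at 1200 px, flow-map blending at 320 px) rely on the scale returned by resize_on_long_side below: it maps original-image coordinates into the resized image, so callers divide by it to get back. A small illustration under that assumption (sketch only, assuming the module path added by this patch):

    import numpy as np
    from modelscope.models.cv.image_body_reshaping.slim_utils import resize_on_long_side

    img = np.zeros((900, 1600, 3), dtype=np.uint8)   # H x W x 3
    small, scale = resize_on_long_side(img, 300)
    print(small.shape, scale)                        # (168, 300, 3) 0.1875
    # pred_joints() in image_body_reshaping.py divides the detected joints by this
    # scale to return them in original-image coordinates.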
+ +import math +import os +import random + +import cv2 +import numba +import numpy as np +import torch + + +def resize_on_long_side(img, long_side=800): + src_height = img.shape[0] + src_width = img.shape[1] + + if src_height > src_width: + scale = long_side * 1.0 / src_height + _img = cv2.resize( + img, (int(src_width * scale), long_side), + interpolation=cv2.INTER_LINEAR) + else: + scale = long_side * 1.0 / src_width + _img = cv2.resize( + img, (long_side, int(src_height * scale)), + interpolation=cv2.INTER_LINEAR) + + return _img, scale + + +def point_in_box(pt, box): + pt_x = pt[0] + pt_y = pt[1] + + if pt_x >= box[0] and pt_x <= box[0] + box[2] and pt_y >= box[ + 1] and pt_y <= box[1] + box[3]: + return True + else: + return False + + +def enlarge_box_tblr(roi_bbox, mask, ratio=0.4, use_long_side=True): + if roi_bbox is None or None in roi_bbox: + return [None, None, None, None] + + top = roi_bbox[0] + bottom = roi_bbox[1] + left = roi_bbox[2] + right = roi_bbox[3] + + roi_width = roi_bbox[3] - roi_bbox[2] + roi_height = roi_bbox[1] - roi_bbox[0] + right = left + roi_width + bottom = top + roi_height + + long_side = roi_width if roi_width > roi_height else roi_height + + if use_long_side: + new_left = left - int(long_side * ratio) + else: + new_left = left - int(roi_width * ratio) + new_left = 1 if new_left < 0 else new_left + + if use_long_side: + new_top = top - int(long_side * ratio) + else: + new_top = top - int(roi_height * ratio) + new_top = 1 if new_top < 0 else new_top + + if use_long_side: + new_right = right + int(long_side * ratio) + else: + new_right = right + int(roi_width * ratio) + new_right = mask.shape[1] - 2 if new_right > mask.shape[1] else new_right + + if use_long_side: + new_bottom = bottom + int(long_side * ratio) + else: + new_bottom = bottom + int(roi_height * ratio) + new_bottom = mask.shape[0] - 2 if new_bottom > mask.shape[0] else new_bottom + + bbox = [new_top, new_bottom, new_left, new_right] + return bbox + + +def gen_PAF(image, joints): + + assert joints.shape[0] == 18 + assert joints.shape[1] == 3 + + org_h = image.shape[0] + org_w = image.shape[1] + small_image, resize_scale = resize_on_long_side(image, 120) + + joints[:, :2] = joints[:, :2] * resize_scale + + joint_left = int(np.min(joints, axis=0)[0]) + joint_right = int(np.max(joints, axis=0)[0]) + joint_top = int(np.min(joints, axis=0)[1]) + joint_bottom = int(np.max(joints, axis=0)[1]) + + limb_width = min( + abs(joint_right - joint_left), abs(joint_bottom - joint_top)) // 6 + + if limb_width % 2 == 0: + limb_width += 1 + kernel_size = limb_width + + part_orders = [(5, 11), (2, 8), (5, 6), (6, 7), (2, 3), (3, 4), (11, 12), + (12, 13), (8, 9), (9, 10)] + + map_list = [] + mask_list = [] + PAF_all = np.zeros( + shape=(small_image.shape[0], small_image.shape[1], 2), + dtype=np.float32) + for c, pair in enumerate(part_orders): + idx_a_name = pair[0] + idx_b_name = pair[1] + + jointa = joints[idx_a_name] + jointb = joints[idx_b_name] + + confidence_threshold = 0.05 + if jointa[2] > confidence_threshold and jointb[ + 2] > confidence_threshold: + canvas = np.zeros( + shape=(small_image.shape[0], small_image.shape[1]), + dtype=np.uint8) + + canvas = cv2.line(canvas, (int(jointa[0]), int(jointa[1])), + (int(jointb[0]), int(jointb[1])), + (255, 255, 255), 5) + + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (kernel_size, kernel_size)) + + canvas = cv2.dilate(canvas, kernel, 1) + canvas = cv2.GaussianBlur(canvas, (kernel_size, kernel_size), 0) + canvas = canvas.astype(np.float32) / 255 + PAF = 
np.zeros( + shape=(small_image.shape[0], small_image.shape[1], 2), + dtype=np.float32) + PAF[..., 0] = jointb[0] - jointa[0] + PAF[..., 1] = jointb[1] - jointa[1] + mag, ang = cv2.cartToPolar(PAF[..., 0], PAF[..., 1]) + PAF /= (np.dstack((mag, mag)) + 1e-5) + + single_PAF = PAF * np.dstack((canvas, canvas)) + map_list.append( + cv2.GaussianBlur(single_PAF, + (kernel_size * 3, kernel_size * 3), 0)) + + mask_list.append( + cv2.GaussianBlur(canvas.copy(), + (kernel_size * 3, kernel_size * 3), 0)) + PAF_all = PAF_all * (1.0 - np.dstack( + (canvas, canvas))) + single_PAF + + PAF_all = cv2.GaussianBlur(PAF_all, (kernel_size * 3, kernel_size * 3), 0) + PAF_all = cv2.resize( + PAF_all, (org_w, org_h), interpolation=cv2.INTER_LINEAR) + map_list.append(PAF_all) + return PAF_all, map_list, mask_list + + +def gen_skeleton_map(joints, stack_mode='column', input_roi_box=None): + if type(joints) == list: + joints = np.array(joints) + assert stack_mode == 'column' or stack_mode == 'depth' + + part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), + (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] + + def link(img, a, b, color, line_width, scale=1.0, x_offset=0, y_offset=0): + jointa = joints[a] + jointb = joints[b] + + temp1 = int((jointa[0] - x_offset) * scale) + temp2 = int((jointa[1] - y_offset) * scale) + temp3 = int((jointb[0] - x_offset) * scale) + temp4 = int((jointb[1] - y_offset) * scale) + + cv2.line(img, (temp1, temp2), (temp3, temp4), color, line_width) + + roi_box = input_roi_box + + roi_box_width = roi_box[3] - roi_box[2] + roi_box_height = roi_box[1] - roi_box[0] + short_side_length = min(roi_box_width, roi_box_height) + line_width = short_side_length // 30 + + line_width = max(line_width, 2) + + map_cube = np.zeros( + shape=(roi_box_height, roi_box_width, len(part_orders) + 1), + dtype=np.float32) + + use_line_width = min(5, line_width) + fx = use_line_width * 1.0 / line_width # fx 最大值为1 + + if fx < 0.99: + map_cube = cv2.resize(map_cube, (0, 0), fx=fx, fy=fx) + + for c, pair in enumerate(part_orders): + tmp = map_cube[..., c].copy() + link( + tmp, + pair[0], + pair[1], (2.0, 2.0, 2.0), + use_line_width, + scale=fx, + x_offset=roi_box[2], + y_offset=roi_box[0]) + map_cube[..., c] = tmp + + tmp = map_cube[..., -1].copy() + link( + tmp, + pair[0], + pair[1], (2.0, 2.0, 2.0), + use_line_width, + scale=fx, + x_offset=roi_box[2], + y_offset=roi_box[0]) + map_cube[..., -1] = tmp + + map_cube = cv2.resize(map_cube, (roi_box_width, roi_box_height)) + + if stack_mode == 'depth': + return map_cube, roi_box + elif stack_mode == 'column': + joint_maps = [] + for c in range(len(part_orders) + 1): + joint_maps.append(map_cube[..., c]) + joint_map = np.column_stack(joint_maps) + + return joint_map, roi_box + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + tl = line_thickness or round( + 0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, (c1[0], c1[1] - 2), + 0, + tl / 3, [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA) + + +def draw_line(im, points, color, stroke_size=2, closed=False): + 
points = points.astype(np.int32) + for i in range(len(points) - 1): + cv2.line(im, tuple(points[i]), tuple(points[i + 1]), color, + stroke_size) + if closed: + cv2.line(im, tuple(points[0]), tuple(points[-1]), color, stroke_size) + + +def enlarged_bbox(bbox, img_width, img_height, enlarge_ratio=0.2): + left = bbox[0] + top = bbox[1] + + right = bbox[2] + bottom = bbox[3] + + roi_width = right - left + roi_height = bottom - top + + new_left = left - int(roi_width * enlarge_ratio) + new_left = 0 if new_left < 0 else new_left + + new_top = top - int(roi_height * enlarge_ratio) + new_top = 0 if new_top < 0 else new_top + + new_right = right + int(roi_width * enlarge_ratio) + new_right = img_width if new_right > img_width else new_right + + new_bottom = bottom + int(roi_height * enlarge_ratio) + new_bottom = img_height if new_bottom > img_height else new_bottom + + bbox = [new_left, new_top, new_right, new_bottom] + + bbox = [int(x) for x in bbox] + + return bbox + + +def get_map_fusion_map_cuda(map_list, threshold=1, device=torch.device('cpu')): + map_list_cuda = [torch.from_numpy(x).to(device) for x in map_list] + map_concat = torch.stack(tuple(map_list_cuda), dim=-1) + + map_concat = torch.abs(map_concat) + + map_concat[map_concat < threshold] = 0 + map_concat[map_concat > 1e-5] = 1.0 + + sum_map = torch.sum(map_concat, dim=2) + a = torch.ones_like(sum_map) + acc_map = torch.where(sum_map > 0, a * 2.0, torch.zeros_like(sum_map)) + + fusion_map = torch.where(sum_map < 0.5, a * 1.5, sum_map) + + fusion_map = fusion_map.float() + acc_map = acc_map.float() + + fusion_map = fusion_map.cpu().numpy().astype(np.float32) + acc_map = acc_map.cpu().numpy().astype(np.float32) + + return fusion_map, acc_map + + +def gen_border_shade(height, width, height_band, width_band): + height_ratio = height_band * 1.0 / height + width_ratio = width_band * 1.0 / width + + _height_band = int(256 * height_ratio) + _width_band = int(256 * width_ratio) + + canvas = np.zeros((256, 256), dtype=np.float32) + + canvas[_height_band // 2:-_height_band // 2, + _width_band // 2:-_width_band // 2] = 1.0 + + canvas = cv2.blur(canvas, (_height_band, _width_band)) + + canvas = cv2.resize(canvas, (width, height)) + + return canvas + + +def get_mask_bbox(mask, threshold=127): + ret, mask = cv2.threshold(mask, threshold, 1, 0) + + if cv2.countNonZero(mask) == 0: + return [None, None, None, None] + + col_acc = np.sum(mask, 0) + row_acc = np.sum(mask, 1) + + col_acc = col_acc.tolist() + row_acc = row_acc.tolist() + + for x in range(len(col_acc)): + if col_acc[x] > 0: + left = x + break + + for x in range(1, len(col_acc)): + if col_acc[-x] > 0: + right = len(col_acc) - x + break + + for x in range(len(row_acc)): + if row_acc[x] > 0: + top = x + break + + for x in range(1, len(row_acc)): + if row_acc[-x] > 0: + bottom = len(row_acc[::-1]) - x + break + return [top, bottom, left, right] + + +def visualize_flow(flow): + h, w = flow.shape[:2] + hsv = np.zeros((h, w, 3), np.uint8) + mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1]) + + hsv[..., 0] = ang * 180 / np.pi / 2 + hsv[..., 1] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX) + hsv[..., 2] = 255 + bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) + bgr = bgr * 1.0 / 255 + return bgr.astype(np.float32) + + +def vis_joints(image, joints, color, show_text=True, confidence_threshold=0.1): + + part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), + (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] + + abandon_idxs = [0, 1, 14, 15, 16, 17] + # draw joints + for i, joint in 
enumerate(joints): + if i in abandon_idxs: + continue + if joint[-1] > confidence_threshold: + + cv2.circle(image, (int(joint[0]), int(joint[1])), 1, color, 2) + if show_text: + cv2.putText(image, + str(i) + '[{:.2f}]'.format(joint[-1]), + (int(joint[0]), int(joint[1])), + cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2) + # draw link + for pair in part_orders: + if joints[pair[0]][-1] > confidence_threshold and joints[ + pair[1]][-1] > confidence_threshold: + cv2.line(image, (int(joints[pair[0]][0]), int(joints[pair[0]][1])), + (int(joints[pair[1]][0]), int(joints[pair[1]][1])), color, + 2) + return image + + +def get_heatmap_cv(img, magn, max_flow_mag): + min_flow_mag = .5 + cv_magn = np.clip( + 255 * (magn - min_flow_mag) / (max_flow_mag - min_flow_mag + 1e-7), + a_min=0, + a_max=255).astype(np.uint8) + if img.dtype != np.uint8: + img = (255 * img).astype(np.uint8) + + heatmap_img = cv2.applyColorMap(cv_magn, cv2.COLORMAP_JET) + heatmap_img = heatmap_img[..., ::-1] + + h, w = magn.shape + img_alpha = np.ones((h, w), dtype=np.double)[:, :, None] + heatmap_alpha = np.clip( + magn / (max_flow_mag + 1e-7), a_min=1e-7, a_max=1)[:, :, None]**.7 + heatmap_alpha[heatmap_alpha < .2]**.5 + pm_hm = heatmap_img * heatmap_alpha + pm_img = img * img_alpha + cv_out = pm_hm + pm_img * (1 - heatmap_alpha) + cv_out = np.clip(cv_out, a_min=0, a_max=255).astype(np.uint8) + + return cv_out + + +def save_heatmap_cv(img, flow, supression=2): + + flow_magn = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2) + flow_magn -= supression + flow_magn[flow_magn <= 0] = 0 + cv_out = get_heatmap_cv(img, flow_magn, np.max(flow_magn) * 1.3) + return cv_out + + +@numba.jit(nopython=True, parallel=False) +def bilinear_interp(x, y, v11, v12, v21, v22): + temp1 = (v11 * (1 - y) + v12 * y) * (1 - x) + temp2 = (v21 * (1 - y) + v22 * y) * x + result = temp1 + temp2 + return result + + +@numba.jit(nopython=True, parallel=False) +def image_warp_grid1(rDx, rDy, oriImg, transRatio, width_expand, + height_expand): + srcW = oriImg.shape[1] + srcH = oriImg.shape[0] + + newImg = oriImg.copy() + + for i in range(srcH): + for j in range(srcW): + _i = i + _j = j + + deltaX = rDx[_i, _j] + deltaY = rDy[_i, _j] + + nx = _j + deltaX * transRatio + ny = _i + deltaY * transRatio + + if nx >= srcW - width_expand - 1: + if nx > srcW - 1: + nx = srcW - 1 + + if ny >= srcH - height_expand - 1: + if ny > srcH - 1: + ny = srcH - 1 + + if nx < width_expand: + if nx < 0: + nx = 0 + + if ny < height_expand: + if ny < 0: + ny = 0 + + nxi = int(math.floor(nx)) + nyi = int(math.floor(ny)) + nxi1 = int(math.ceil(nx)) + nyi1 = int(math.ceil(ny)) + + for ll in range(3): + newImg[_i, _j, + ll] = bilinear_interp(ny - nyi, nx - nxi, + oriImg[nyi, nxi, + ll], oriImg[nyi, nxi1, ll], + oriImg[nyi1, nxi, + ll], oriImg[nyi1, nxi1, + ll]) + return newImg diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 717ff4dd..c16e256e 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -184,6 +184,7 @@ TASK_OUTPUTS = { Tasks.image_to_image_translation: [OutputKeys.OUTPUT_IMG], Tasks.image_style_transfer: [OutputKeys.OUTPUT_IMG], Tasks.image_portrait_stylization: [OutputKeys.OUTPUT_IMG], + Tasks.image_body_reshaping: [OutputKeys.OUTPUT_IMG], # live category recognition result for single video # { diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 7fa66b5f..c9a70d14 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -75,6 +75,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 
'damo/nlp_bart_text-error-correction_chinese'), Tasks.image_captioning: (Pipelines.image_captioning, 'damo/ofa_image-caption_coco_large_en'), + Tasks.image_body_reshaping: (Pipelines.image_body_reshaping, + 'damo/cv_flow-based-body-reshaping_damo'), Tasks.image_portrait_stylization: (Pipelines.person_image_cartoon, 'damo/cv_unet_person-image-cartoon_compound-models'), diff --git a/modelscope/pipelines/cv/image_body_reshaping_pipeline.py b/modelscope/pipelines/cv/image_body_reshaping_pipeline.py new file mode 100644 index 00000000..c3600eb5 --- /dev/null +++ b/modelscope/pipelines/cv/image_body_reshaping_pipeline.py @@ -0,0 +1,40 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_body_reshaping, module_name=Pipelines.image_body_reshaping) +class ImageBodyReshapingPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image body reshaping pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + logger.info('body reshaping model init done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + result = {'img': img} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + output = self.model.inference(input['img']) + result = {'outputs': output} + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + output_img = inputs['outputs'] + return {OutputKeys.OUTPUT_IMG: output_img} diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 5bc27c03..2331dc85 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -60,7 +60,7 @@ class CVTasks(object): image_to_image_generation = 'image-to-image-generation' image_style_transfer = 'image-style-transfer' image_portrait_stylization = 'image-portrait-stylization' - + image_body_reshaping = 'image-body-reshaping' image_embedding = 'image-embedding' product_retrieval_embedding = 'product-retrieval-embedding' diff --git a/requirements/cv.txt b/requirements/cv.txt index 5a2d7763..f907256d 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -13,6 +13,7 @@ ml_collections mmcls>=0.21.0 mmdet>=2.25.0 networkx>=2.5 +numba onnxruntime>=1.10 pai-easycv>=0.6.3.6 pandas diff --git a/tests/pipelines/test_image_body_reshaping.py b/tests/pipelines/test_image_body_reshaping.py new file mode 100644 index 00000000..e1955e94 --- /dev/null +++ b/tests/pipelines/test_image_body_reshaping.py @@ -0,0 +1,58 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
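For reference, the minimal user-facing call path wired up by this patch (the task constant, default model id and output key all come from the diffs above; an illustrative sketch, not part of the test):

    import cv2
    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    body_reshaping = pipeline(Tasks.image_body_reshaping,
                              model='damo/cv_flow-based-body-reshaping_damo')
    result = body_reshaping('data/test/images/image_body_reshaping.jpg')
    cv2.imwrite('reshaped.png', result[OutputKeys.OUTPUT_IMG])

The unit test below exercises the same path at the configured test levels.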
+import os.path as osp +import unittest + +import cv2 + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ImageBodyReshapingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_body_reshaping + self.model_id = 'damo/cv_flow-based-body-reshaping_damo' + self.test_image = 'data/test/images/image_body_reshaping.jpg' + + def pipeline_inference(self, pipeline: Pipeline, input_location: str): + result = pipeline(input_location) + if result is not None: + cv2.imwrite('result_bodyreshaping.png', + result[OutputKeys.OUTPUT_IMG]) + print( + f'Output written to {osp.abspath("result_body_reshaping.png")}' + ) + else: + raise Exception('Testing failed: invalid output') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + model_dir = snapshot_download(self.model_id) + image_body_reshaping = pipeline( + Tasks.image_body_reshaping, model=model_dir) + self.pipeline_inference(image_body_reshaping, self.test_image) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + image_body_reshaping = pipeline( + Tasks.image_body_reshaping, model=self.model_id) + self.pipeline_inference(image_body_reshaping, self.test_image) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + image_body_reshaping = pipeline(Tasks.image_body_reshaping) + self.pipeline_inference(image_body_reshaping, self.test_image) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From 12ee711f682b5d90d35eb6c7ec024ccf87ee619a Mon Sep 17 00:00:00 2001 From: "wenqi.oywq" Date: Tue, 11 Oct 2022 11:07:45 +0800 Subject: [PATCH 02/57] [to #42322933]add license header Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10353812 * add license header --- modelscope/models/cv/image_color_enhance/csrnet.py | 3 +++ .../models/cv/image_color_enhance/image_color_enhance.py | 1 + 2 files changed, 4 insertions(+) diff --git a/modelscope/models/cv/image_color_enhance/csrnet.py b/modelscope/models/cv/image_color_enhance/csrnet.py index 782cd528..502abf88 100644 --- a/modelscope/models/cv/image_color_enhance/csrnet.py +++ b/modelscope/models/cv/image_color_enhance/csrnet.py @@ -1,3 +1,6 @@ +# The implementation is adopted from Jingwen He, +# made publicly available at https://github.com/hejingwenhejingwen/CSRNet + import functools import math diff --git a/modelscope/models/cv/image_color_enhance/image_color_enhance.py b/modelscope/models/cv/image_color_enhance/image_color_enhance.py index 382cc152..0bd74197 100644 --- a/modelscope/models/cv/image_color_enhance/image_color_enhance.py +++ b/modelscope/models/cv/image_color_enhance/image_color_enhance.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp from copy import deepcopy from typing import Dict, Union From 4bd72e528ad9938e131908c5a67920666fcdcae1 Mon Sep 17 00:00:00 2001 From: pangda Date: Tue, 11 Oct 2022 11:14:34 +0800 Subject: [PATCH 03/57] [to #42322933] support restore best checkpoint after training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 支持训练完成后自动恢复best ckpt,方便在不同测试集上进行测试 2. build_optimizer/build_lr_scheduler改为成员函数,方便重载(如模型分层lr) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10348255 --- modelscope/trainers/hooks/checkpoint_hook.py | 7 +++++++ modelscope/trainers/trainer.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index 220929b8..c9f51a88 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -216,6 +216,7 @@ class BestCkptSaverHook(CheckpointHook): by_epoch (bool): Save best checkpoints by epoch or by iteration. save_optimizer (bool): Whether to save optimizer state dict. Default: True. save_dir (str): Output directory to save best checkpoint. + restore_best (bool): Whether to restore the best checkpoint after training. """ PRIORITY = Priority.LOW @@ -228,6 +229,7 @@ class BestCkptSaverHook(CheckpointHook): save_optimizer=True, save_dir=None, save_file_name=None, + restore_best=False, interval=0): assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.' super().__init__( @@ -241,6 +243,7 @@ class BestCkptSaverHook(CheckpointHook): self._best_metric = None self._best_ckpt_file = None self.save_file_name = save_file_name + self.restore_best = restore_best def _should_save(self, trainer): return self._is_best_metric(trainer.metric_values) @@ -305,3 +308,7 @@ class BestCkptSaverHook(CheckpointHook): self.logger.warn( 'The state_dict is not available, the best metric value will be affected.' 
) + + def after_run(self, trainer): + if self.restore_best: + self.load_checkpoint(self._best_ckpt_file, trainer) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index a01d9b59..4c21d63f 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -664,6 +664,12 @@ class EpochBasedTrainer(BaseTrainer): dataset = self.to_task_dataset(torch_dataset, mode) return dataset + def build_optimizer(self, cfg: ConfigDict, default_args: dict = None): + return build_optimizer(self.model, cfg=cfg, default_args=default_args) + + def build_lr_scheduler(self, cfg: ConfigDict, default_args: dict = None): + return build_lr_scheduler(cfg=cfg, default_args=default_args) + def create_optimizer_and_scheduler(self): """ Create optimizer and lr scheduler @@ -680,7 +686,7 @@ class EpochBasedTrainer(BaseTrainer): optim_options = {} if optimizer_cfg is not None: optim_options = optimizer_cfg.pop('options', {}) - optimizer = build_optimizer(self.model, cfg=optimizer_cfg) + optimizer = self.build_optimizer(cfg=optimizer_cfg) if lr_scheduler is None: lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None) @@ -691,7 +697,7 @@ class EpochBasedTrainer(BaseTrainer): if lr_scheduler_cfg is not None: assert optimizer is not None lr_options = lr_scheduler_cfg.pop('options', {}) - lr_scheduler = build_lr_scheduler( + lr_scheduler = self.build_lr_scheduler( cfg=lr_scheduler_cfg, default_args={'optimizer': optimizer}) self.optimizer = optimizer From 333c11c0a61c780a524d1b3b07793cff0d46a8da Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Tue, 11 Oct 2022 14:06:07 +0800 Subject: [PATCH 04/57] [to #42322933] fix: missing type bytes in InputType.AUDIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 已经和谦言讨论过,确认可添加 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10358110 * fix: missing type bytes in InputType.AUDIO --- modelscope/pipeline_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index de9814a7..2b14c278 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -28,7 +28,7 @@ class InputType(object): INPUT_TYPE = { InputType.IMAGE: (str, np.ndarray, Image.Image), InputType.TEXT: str, - InputType.AUDIO: (str, np.ndarray), + InputType.AUDIO: (str, bytes, np.ndarray), InputType.VIDEO: (str, np.ndarray, cv2.VideoCapture), InputType.BOX: (list, np.ndarray), InputType.DICT: (dict, type(None)), From 09d2296f36f1a301dc5e144e00a692dfce2675ee Mon Sep 17 00:00:00 2001 From: "laiyin.lyc" Date: Tue, 11 Oct 2022 16:05:20 +0800 Subject: [PATCH 05/57] [to #44847108] add sparsity hook (pst algorithm) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10198228 * [to #44847108] add sparsity hook (pst algorithm) --- modelscope/metainfo.py | 3 + modelscope/trainers/hooks/__init__.py | 4 +- .../trainers/hooks/compression/__init__.py | 24 ++ .../hooks/compression/sparsity_hook.py | 131 +++++++++++ .../trainers/hooks/compression/utils.py | 208 ++++++++++++++++++ tests/trainers/hooks/compression/__init__.py | 0 .../hooks/compression/test_sparsity_hook.py | 113 ++++++++++ 7 files changed, 482 insertions(+), 1 deletion(-) create mode 100644 modelscope/trainers/hooks/compression/__init__.py create mode 100644 modelscope/trainers/hooks/compression/sparsity_hook.py create mode 100644 modelscope/trainers/hooks/compression/utils.py create mode 100644 tests/trainers/hooks/compression/__init__.py create mode 100644 
tests/trainers/hooks/compression/test_sparsity_hook.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 1b8c4720..77627abc 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -404,6 +404,9 @@ class Hooks(object): IterTimerHook = 'IterTimerHook' EvaluationHook = 'EvaluationHook' + # Compression + SparsityHook = 'SparsityHook' + class LR_Schedulers(object): """learning rate scheduler is defined here diff --git a/modelscope/trainers/hooks/__init__.py b/modelscope/trainers/hooks/__init__.py index f133041b..a2e0cf4b 100644 --- a/modelscope/trainers/hooks/__init__.py +++ b/modelscope/trainers/hooks/__init__.py @@ -6,10 +6,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .builder import HOOKS, build_hook from .checkpoint_hook import BestCkptSaverHook, CheckpointHook + from .compression import SparsityHook from .evaluation_hook import EvaluationHook from .hook import Hook from .iter_timer_hook import IterTimerHook - from .logger import TextLoggerHook, TensorboardHook + from .logger import TensorboardHook, TextLoggerHook from .lr_scheduler_hook import LrSchedulerHook from .optimizer import (ApexAMPOptimizerHook, NoneOptimizerHook, OptimizerHook, TorchAMPOptimizerHook) @@ -19,6 +20,7 @@ else: _import_structure = { 'builder': ['HOOKS', 'build_hook'], 'checkpoint_hook': ['BestCkptSaverHook', 'CheckpointHook'], + 'compression': ['SparsityHook'], 'evaluation_hook': ['EvaluationHook'], 'hook': ['Hook'], 'iter_timer_hook': ['IterTimerHook'], diff --git a/modelscope/trainers/hooks/compression/__init__.py b/modelscope/trainers/hooks/compression/__init__.py new file mode 100644 index 00000000..f755b2ca --- /dev/null +++ b/modelscope/trainers/hooks/compression/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .sparsity_hook import SparsityHook + from .utils import SparseLinear, convert_sparse_network + +else: + _import_structure = { + 'sparsity_hook': ['SparsityHook'], + 'utils': ['convert_sparse_network', 'SparseLinear'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/trainers/hooks/compression/sparsity_hook.py b/modelscope/trainers/hooks/compression/sparsity_hook.py new file mode 100644 index 00000000..993488d8 --- /dev/null +++ b/modelscope/trainers/hooks/compression/sparsity_hook.py @@ -0,0 +1,131 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
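+#
+# Configuration sketch (illustrative; the values mirror the unit test added
+# with this hook, and unlisted keys fall back to the defaults read in
+# __init__ below):
+#
+#     'train': {
+#         'hooks': [{
+#             'type': 'SparsityHook',
+#             'pruning_method': 'pst',
+#             'config': {
+#                 'weight_rank': 1,
+#                 'mask_rank': 1,
+#                 'final_sparsity': 0.9,
+#                 'frequency': 1,
+#             },
+#         }],
+#     }
+#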
+import os + +from modelscope import __version__ +from modelscope.metainfo import Hooks +from modelscope.trainers.hooks.builder import HOOKS +from modelscope.trainers.hooks.hook import Hook +from modelscope.trainers.hooks.priority import Priority +from modelscope.utils.checkpoint import save_checkpoint +from modelscope.utils.torch_utils import is_master + + +@HOOKS.register_module(module_name=Hooks.SparsityHook) +class SparsityHook(Hook): + + PRIORITY = Priority.HIGHEST + + def __init__(self, pruning_method, config={}, save_dir=None): + self.pruning_method = pruning_method + self.save_dir = save_dir + + self.compress_module = config.get('compress_module', []) + self.weight_rank = config.get('weight_rank', 8) + self.weight_beta = config.get('weight_beta', 1) + self.mask_rank = config.get('mask_rank', 8) + self.mask_alpha1 = config.get('mask_alpha1', 1) + self.mask_alpha2 = config.get('mask_alpha2', 1) + + self.step = 0 + self.total_step = 0 + self.frequency = config.get('frequency', 1) + self.initial_warmup = config.get('initial_warmup', 0.1) + self.final_warmup = config.get('final_warmup', 0.3) + self.initial_sparsity = config.get('initial_sparsity', 0.0) + self.final_sparsity = config.get('final_sparsity', 0.0) + + def before_run(self, trainer): + import torch + + from .utils import SparseLinear, convert_sparse_network + + if self.save_dir is None: + self.save_dir = trainer.work_dir + + if len(self.compress_module) == 0: + convert_sparse_network( + trainer.model, + pruning_method=self.pruning_method, + weight_rank=self.weight_rank, + weight_beta=self.weight_beta, + mask_rank=self.mask_rank, + mask_alpha1=self.mask_alpha1, + mask_alpha2=self.mask_alpha2, + logger=trainer.logger, + ) + else: + for cm in self.compress_module: + for name, module in trainer.model.named_modules(): + if name != cm: + continue + convert_sparse_network( + module, + pruning_method=self.pruning_method, + weight_rank=self.weight_rank, + weight_beta=self.weight_beta, + mask_rank=self.mask_rank, + mask_alpha1=self.mask_alpha1, + mask_alpha2=self.mask_alpha2, + logger=trainer.logger, + ) + + for i in range(len(trainer.optimizer.param_groups)): + new_train_params = [] + for param in trainer.optimizer.param_groups[i]['params']: + is_find = False + for name, module in trainer.model.named_modules(): + if isinstance(module, SparseLinear): + if torch.equal(param.half(), + module.weight.data.half()): + is_find = True + break + + if not is_find: + new_train_params.append(param) + + trainer.optimizer.param_groups[i]['params'] = new_train_params + + new_params = [] + for name, module in trainer.model.named_modules(): + if isinstance(module, SparseLinear): + new_params.extend( + [p for p in module.parameters() if p.requires_grad]) + + trainer.optimizer.add_param_group({'params': new_params}) + + self.total_step = trainer.iters_per_epoch * trainer._max_epochs + + def before_train_iter(self, trainer): + from .utils import schedule_sparsity_ratio, update_network_sparsity + + cur_sparsity = schedule_sparsity_ratio( + self.step, + self.total_step, + self.frequency, + self.initial_warmup, + self.final_warmup, + self.initial_sparsity, + self.final_sparsity, + ) + + update_network_sparsity(trainer.model, cur_sparsity) + + if is_master(): + trainer.logger.info( + f'Step[{self.step}/{self.total_step}] current sparsity ratio = {cur_sparsity}' + ) + + self.step += 1 + + def after_run(self, trainer): + from .utils import generate_sparse_model + + generate_sparse_model(trainer.model, logger=trainer.logger) + + self._save_checkpoint(trainer) + + 
def _save_checkpoint(self, trainer): + if is_master(): + trainer.logger.info('Saving checkpoint at final compress') + cur_save_name = os.path.join(self.save_dir, 'compress_model.pth') + save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) diff --git a/modelscope/trainers/hooks/compression/utils.py b/modelscope/trainers/hooks/compression/utils.py new file mode 100644 index 00000000..59418201 --- /dev/null +++ b/modelscope/trainers/hooks/compression/utils.py @@ -0,0 +1,208 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn as nn + +from modelscope.utils.torch_utils import is_master + + +class SparseBinarizer(torch.autograd.Function): + + @staticmethod + def forward(ctx, mask_scores, sparsity): + num_prune = int(mask_scores.numel() * sparsity) + prune_indices = torch.argsort(mask_scores.reshape(-1))[:num_prune] + mask = mask_scores.clone().fill_(1) + mask.reshape(-1)[prune_indices] = 0.0 + return mask + + @staticmethod + def backward(ctx, gradOutput): + return gradOutput, None + + +class SparseLinear(nn.Module): + """ + Fully Connected layer with on the fly adaptive mask. + """ + + def __init__( + self, + module, + pruning_method='pst', + weight_rank=8, + weight_beta=1.0, + mask_rank=8, + mask_alpha1=1.0, + mask_alpha2=1.0, + ): + super(SparseLinear, self).__init__() + self.module = module + out_features = self.module.weight.shape[0] + in_features = self.module.weight.shape[1] + + self.weight = self.module.weight + self.module.weight = None + self.module._parameters.pop('weight') + + self.pruning_method = pruning_method + + self.cur_sparsity = 0.0 + + if self.pruning_method == 'pst': + self.weight_rank = weight_rank + self.weight_beta = weight_beta + self.mask_rank = mask_rank + self.mask_alpha1 = mask_alpha1 + self.mask_alpha2 = mask_alpha2 + + # create trainable params + self.weight_U = nn.Parameter( + torch.randn(out_features, self.weight_rank).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.weight_V = nn.Parameter( + torch.zeros(self.weight_rank, in_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + + self.mask_scores_A = nn.Parameter( + torch.randn(out_features, self.mask_rank).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.mask_scores_B = nn.Parameter( + torch.zeros(self.mask_rank, in_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.mask_scores_R = nn.Parameter( + torch.zeros(out_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.mask_scores_C = nn.Parameter( + torch.zeros(in_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + + self.weight.requires_grad = False + if self.module.bias is not None: + self.module.bias.requires_grad = False + + def forward(self, *inputs): + if self.pruning_method == 'pst': + weight = self.weight + self.weight_beta * self.weight_U @ self.weight_V + mask_scores = ( + weight.abs() + + self.mask_alpha1 * self.mask_scores_A @ self.mask_scores_B + + self.mask_alpha2 * (self.mask_scores_R.unsqueeze(1) + + self.mask_scores_C.unsqueeze(0))) + + mask = SparseBinarizer.apply(mask_scores, self.cur_sparsity) + masked_weight = mask * weight + + self.module.weight = masked_weight + return self.module(*inputs) + else: + return self.module(*inputs) + + def convert(self): + if self.pruning_method == 'pst': + weight = self.weight + self.weight_beta * self.weight_U @ self.weight_V + mask_scores = ( + weight.abs() + + self.mask_alpha1 * self.mask_scores_A @ self.mask_scores_B + + self.mask_alpha2 * 
(self.mask_scores_R.unsqueeze(1) + + self.mask_scores_C.unsqueeze(0))) + + mask = SparseBinarizer.apply(mask_scores, self.cur_sparsity) + + masked_weight = mask * weight + self.module.weight = nn.Parameter(masked_weight.data) + + +def _setattr(model, name, module): + name_list = name.split('.') + for name in name_list[:-1]: + model = getattr(model, name) + setattr(model, name_list[-1], module) + + +def convert_sparse_network( + model, + pruning_method, + weight_rank, + weight_beta, + mask_rank, + mask_alpha1, + mask_alpha2, + logger=None, +): + compress_module = [nn.Linear] + try: + from megatron import mpu + compress_module.extend( + [mpu.RowParallelLinear, mpu.ColumnParallelLinear]) + except ImportError: + pass + + for name, module in model.named_modules(): + if type(module) in compress_module: + new_module = SparseLinear( + module, + pruning_method, + weight_rank, + weight_beta, + mask_rank, + mask_alpha1, + mask_alpha2, + ) + + # replace original module by new sparse module + _setattr(model, name, new_module) + + if is_master(): + if logger: + logger.info(f'convert {name} to sparse module.') + else: + print(f'convert {name} to sparse module.') + + +def update_network_sparsity(model, sparsity): + for name, module in model.named_modules(): + if isinstance(module, SparseLinear): + module.cur_sparsity = sparsity + + +def schedule_sparsity_ratio( + step, + total_step, + frequency, + initial_warmup, + final_warmup, + initial_sparsity, + final_sparsity, +): + if step <= initial_warmup * total_step: + sparsity = initial_sparsity + elif step > (total_step - final_warmup * total_step): + sparsity = final_sparsity + else: + spars_warmup_steps = initial_warmup * total_step + spars_schedu_steps = (final_warmup + initial_warmup) * total_step + step = (step - spars_warmup_steps) // frequency * frequency + mul_coeff = 1 - step / (total_step - spars_schedu_steps) + sparsity = final_sparsity + (initial_sparsity - final_sparsity) * ( + mul_coeff**3) + return sparsity + + +def generate_sparse_model(model, logger=None): + # generate sparse weight for saving + for name, module in model.named_modules(): + if isinstance(module, SparseLinear): + module.convert() + + _setattr(model, name, module.module) + + if is_master(): + if logger: + logger.info(f'convert {name} weight to sparse weight, \ + sparsity ratio={torch.mean(1.0*(module.module.weight==0)).item()}.' + ) + else: + print(f'convert {name} weight to sparse, \ + sparsity ratio={torch.mean(1.0*(module.module.weight==0)).item()}.' + ) diff --git a/tests/trainers/hooks/compression/__init__.py b/tests/trainers/hooks/compression/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/trainers/hooks/compression/test_sparsity_hook.py b/tests/trainers/hooks/compression/test_sparsity_hook.py new file mode 100644 index 00000000..4af4dcdb --- /dev/null +++ b/tests/trainers/hooks/compression/test_sparsity_hook.py @@ -0,0 +1,113 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
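+#
+# Schedule sketch (illustrative): with the hook defaults (initial_warmup=0.1,
+# final_warmup=0.3, initial_sparsity=0.0) and the config used below
+# (final_sparsity=0.9, frequency=1), schedule_sparsity_ratio keeps the ratio
+# at 0.0 for roughly the first 10% of total steps, holds it at 0.9 over
+# roughly the last 30%, and ramps it cubically in between:
+#
+#     from modelscope.trainers.hooks.compression.utils import (
+#         schedule_sparsity_ratio)
+#
+#     for step in range(0, 25, 5):
+#         print(step, schedule_sparsity_ratio(step, 25, 1, 0.1, 0.3, 0.0, 0.9))
+#
+# The assertion at the end of the test checks that 90% of the linear layer's
+# weights are exactly zero once the hook has finished.
+#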
+import os +import shutil +import tempfile +import unittest + +import json +import numpy as np +import torch +from torch import nn +from torch.optim import SGD +from torch.optim.lr_scheduler import MultiStepLR + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile, TrainerStages +from modelscope.utils.test_utils import create_dummy_test_dataset + +dummy_dataset = create_dummy_test_dataset( + np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10) + + +class DummyModel(nn.Module, Model): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 10) + self.bn = nn.BatchNorm1d(10) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class SparsityHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_sparsity_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': + self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'hooks': [{ + 'type': 'SparsityHook', + 'pruning_method': 'pst', + 'config': { + 'weight_rank': 1, + 'mask_rank': 1, + 'final_sparsity': 0.9, + 'frequency': 1, + }, + }], + }, + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + optimizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) + trainer_name = Trainers.default + kwargs = dict( + cfg_file=config_path, + model=model, + train_dataset=dummy_dataset, + optimizers=(optimizer, lr_scheduler), + max_epochs=5, + device='cpu', + ) + + trainer = build_trainer(trainer_name, kwargs) + train_dataloader = trainer._build_dataloader_with_dataset( + trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.register_optimizers_hook() + trainer.register_hook_from_cfg(trainer.cfg.train.hooks) + trainer.train_dataloader = train_dataloader + trainer.data_loader = train_dataloader + trainer.invoke_hook(TrainerStages.before_run) + for i in range(trainer._epoch, trainer._max_epochs): + trainer.invoke_hook(TrainerStages.before_train_epoch) + for _, data_batch in enumerate(train_dataloader): + trainer.invoke_hook(TrainerStages.before_train_iter) + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook(TrainerStages.after_train_iter) + trainer.invoke_hook(TrainerStages.after_train_epoch) + trainer.invoke_hook(TrainerStages.after_run) + + self.assertEqual( + torch.mean(1.0 * (trainer.model.linear.weight == 0)), 0.9) + + +if __name__ == '__main__': + unittest.main() From 67d6fa001da5cb58b81dcb68968355995f8e586f Mon Sep 17 00:00:00 2001 From: "hanyuan.chy" Date: Tue, 11 Oct 2022 17:17:51 +0800 Subject: [PATCH 06/57] [to #42322933] unify keys forbody_3d_keypoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 统一关键点检测输出key的名字 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10359335 --- modelscope/outputs.py | 4 ++-- modelscope/pipelines/cv/body_3d_keypoints_pipeline.py | 4 ++-- tests/pipelines/test_body_3d_keypoints.py | 2 +- 3 files 
changed, 5 insertions(+), 5 deletions(-) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index c16e256e..331f4816 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -222,7 +222,7 @@ TASK_OUTPUTS = { # 3D human body keypoints detection result for single sample # { - # "poses": [ # 3d pose coordinate in camera coordinate + # "keypoints": [ # 3d pose coordinate in camera coordinate # [[x, y, z]*17], # joints of per image # [[x, y, z]*17], # ... @@ -236,7 +236,7 @@ TASK_OUTPUTS = { # and is only avaialbe when the "render" option is enabled. # } Tasks.body_3d_keypoints: - [OutputKeys.POSES, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO], + [OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO], # 2D hand keypoints result for single sample # { diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index b0faa1e0..c3f4e8c1 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -180,7 +180,7 @@ class Body3DKeypointsPipeline(Pipeline): return res def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: - res = {OutputKeys.POSES: [], OutputKeys.TIMESTAMPS: []} + res = {OutputKeys.KEYPOINTS: [], OutputKeys.TIMESTAMPS: []} if not input['success']: pass @@ -197,7 +197,7 @@ class Body3DKeypointsPipeline(Pipeline): self.render_prediction(pred_3d_pose, output_video_path) res[OutputKeys.OUTPUT_VIDEO] = output_video_path - res[OutputKeys.POSES] = pred_3d_pose + res[OutputKeys.KEYPOINTS] = pred_3d_pose res[OutputKeys.TIMESTAMPS] = self.timestamps return res diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py index 6f27f12d..6e671d2e 100644 --- a/tests/pipelines/test_body_3d_keypoints.py +++ b/tests/pipelines/test_body_3d_keypoints.py @@ -21,7 +21,7 @@ class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck): def pipeline_inference(self, pipeline: Pipeline, pipeline_input): output = pipeline(pipeline_input, output_video='./result.mp4') - poses = np.array(output[OutputKeys.POSES]) + poses = np.array(output[OutputKeys.KEYPOINTS]) print(f'result 3d points shape {poses.shape}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') From e240edea7ebbd7beb66246fb18b071a6ba0a65c0 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 11 Oct 2022 17:20:11 +0800 Subject: [PATCH 07/57] [to #42322933]t5 bug fixex --- modelscope/preprocessors/nlp/nlp_base.py | 4 +--- tests/pipelines/test_text2text_generation.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 6b559de9..a9be0cb0 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -417,14 +417,12 @@ class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): tokenizer=None, mode=ModeKeys.INFERENCE, **kwargs): - self.tokenizer = self.build_tokenizer( - model_dir) if tokenizer is None else tokenizer kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') kwargs['padding'] = kwargs.get('padding', False) kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', False) kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: 
text_a, _, _ = self.parse_text_and_label(data) diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index a39562f5..2506547e 100644 --- a/tests/pipelines/test_text2text_generation.py +++ b/tests/pipelines/test_text2text_generation.py @@ -18,7 +18,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): self.model_id = 'damo/t5-cn-base-test' self.input = '中国的首都位于。' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_T5(self): cache_path = snapshot_download(self.model_id) model = T5ForConditionalGeneration(cache_path) @@ -40,7 +40,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): preprocessor=preprocessor) print(pipeline_ins(self.input)) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_pipeline_with_model_id(self): pipeline_ins = pipeline( task=Tasks.text2text_generation, model=self.model_id) From 65be443e982755a96915b363af6b6ad2dfe5c827 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Tue, 11 Oct 2022 17:22:58 +0800 Subject: [PATCH 08/57] [to #41669377] Add more models to test in tts UT Link https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10360754#tab=detail Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10360754 --- .../audio/tts/models/datasets/__init__.py | 0 tests/pipelines/test_text_to_speech.py | 70 +++++++++++++++---- 2 files changed, 58 insertions(+), 12 deletions(-) mode change 100755 => 100644 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100755 new mode 100644 diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index f659e59b..0caf1c84 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -27,21 +27,67 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, def setUp(self) -> None: self.task = Tasks.text_to_speech - self.model_id = 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k' + zhcn_text = '今天北京天气怎么样' + en_text = 'How is the weather in Beijing?' 
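+        # The cases below pair each voice with its hub model id: per-voice
+        # models follow damo/speech_sambert-hifigan_tts_<voice>_<lang>_16k
+        # (e.g. damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k), while
+        # the language-level models drop the voice segment, e.g.
+        # damo/speech_sambert-hifigan_tts_zh-cn_16k.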
+ zhcn_voice = ['zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', 'zhibei_emo'] + enus_voice = ['andy', 'annie'] + engb_voice = ['luca', 'luna'] + self.tts_test_cases = [] + for voice in zhcn_voice: + model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, + 'zh-cn') + self.tts_test_cases.append({ + 'voice': voice, + 'model_id': model_id, + 'text': zhcn_text + }) + for voice in enus_voice: + model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, + 'en-us') + self.tts_test_cases.append({ + 'voice': voice, + 'model_id': model_id, + 'text': en_text + }) + for voice in engb_voice: + model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, + 'en-gb') + self.tts_test_cases.append({ + 'voice': voice, + 'model_id': model_id, + 'text': en_text + }) + zhcn_model_id = 'damo/speech_sambert-hifigan_tts_zh-cn_16k' + enus_model_id = 'damo/speech_sambert-hifigan_tts_en-us_16k' + engb_model_id = 'damo/speech_sambert-hifigan_tts_en-gb_16k' + self.tts_test_cases.append({ + 'voice': 'zhcn', + 'model_id': zhcn_model_id, + 'text': zhcn_text + }) + self.tts_test_cases.append({ + 'voice': 'enus', + 'model_id': enus_model_id, + 'text': en_text + }) + self.tts_test_cases.append({ + 'voice': 'engb', + 'model_id': engb_model_id, + 'text': en_text + }) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_pipeline(self): - text = '今天北京天气怎么样?' - voice = 'zhitian_emo' - - model = Model.from_pretrained( - model_name_or_path=self.model_id, revision='pytorch_am') - sambert_hifigan_tts = pipeline(task=self.task, model=model) - self.assertTrue(sambert_hifigan_tts is not None) - output = sambert_hifigan_tts(input=text, voice=voice) - self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) - pcm = output[OutputKeys.OUTPUT_PCM] - write('output.wav', 16000, pcm) + for case in self.tts_test_cases: + logger.info('test %s' % case['voice']) + model = Model.from_pretrained( + model_name_or_path=case['model_id'], revision='pytorch_am') + sambert_hifigan_tts = pipeline(task=self.task, model=model) + self.assertTrue(sambert_hifigan_tts is not None) + output = sambert_hifigan_tts(input=case['text']) + self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) + pcm = output[OutputKeys.OUTPUT_PCM] + write('output_%s.wav' % case['voice'], 16000, pcm) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From 4c993638046a051a624375241ae5271509ebb510 Mon Sep 17 00:00:00 2001 From: "james.wjg" Date: Tue, 11 Oct 2022 17:24:46 +0800 Subject: [PATCH 09/57] =?UTF-8?q?[to=20#42322933]video=20summarization=20?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20license=20&=20header;=20=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=20output=20for=20demo=20service?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit video summarization: 1. 添加 license & header; 2. 
修改 output for demo service Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10260946 --- .../metrics/video_summarization_metric.py | 3 ++ .../models/cv/video_summarization/__init__.py | 23 +++++++++- .../cv/video_summarization/base_model.py | 3 +- .../cv/video_summarization/kts/cpd_auto.py | 3 +- .../cv/video_summarization/kts/cpd_nonlin.py | 3 +- .../models/cv/video_summarization/pgl_sum.py | 3 +- .../cv/video_summarization/summarizer.py | 46 ++++++++++++++++++- .../video_summarization_dataset.py | 5 +- modelscope/outputs.py | 16 +++++++ .../cv/video_summarization_pipeline.py | 13 ++++-- tests/pipelines/test_video_summarization.py | 3 -- 11 files changed, 107 insertions(+), 14 deletions(-) diff --git a/modelscope/metrics/video_summarization_metric.py b/modelscope/metrics/video_summarization_metric.py index d1867600..40580382 100644 --- a/modelscope/metrics/video_summarization_metric.py +++ b/modelscope/metrics/video_summarization_metric.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM + from typing import Dict import numpy as np diff --git a/modelscope/models/cv/video_summarization/__init__.py b/modelscope/models/cv/video_summarization/__init__.py index 064110f7..15ad61b4 100644 --- a/modelscope/models/cv/video_summarization/__init__.py +++ b/modelscope/models/cv/video_summarization/__init__.py @@ -1 +1,22 @@ -from .summarizer import PGLVideoSummarization +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .summarizer import (PGLVideoSummarization, summary_format) + +else: + _import_structure = { + 'summarizer': ['PGLVideoSummarization', 'summary_format'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/video_summarization/base_model.py b/modelscope/models/cv/video_summarization/base_model.py index 670da251..912ba68d 100644 --- a/modelscope/models/cv/video_summarization/base_model.py +++ b/modelscope/models/cv/video_summarization/base_model.py @@ -1,4 +1,5 @@ -# The implementation is based on pytorch-caffe-models, available at https://github.com/crowsonkb/pytorch-caffe-models. +# Part of the implementation is borrowed and modified from pytorch-caffe-models, +# publicly available at https://github.com/crowsonkb/pytorch-caffe-models import cv2 import numpy as np diff --git a/modelscope/models/cv/video_summarization/kts/cpd_auto.py b/modelscope/models/cv/video_summarization/kts/cpd_auto.py index a794ca26..58281df8 100644 --- a/modelscope/models/cv/video_summarization/kts/cpd_auto.py +++ b/modelscope/models/cv/video_summarization/kts/cpd_auto.py @@ -1,4 +1,5 @@ -# The implementation is based on KTS, available at https://github.com/TatsuyaShirakawa/KTS. +# Part of the implementation is borrowed and modified from KTS, +# publicly available at https://github.com/TatsuyaShirakawa/KTS import numpy as np diff --git a/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py b/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py index ef2eb6ef..55e279e9 100644 --- a/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py +++ b/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py @@ -1,4 +1,5 @@ -# The implementation is based on KTS, available at https://github.com/TatsuyaShirakawa/KTS. 
+# Part of the implementation is borrowed and modified from KTS, +# publicly available at https://github.com/TatsuyaShirakawa/KTS import numpy as np diff --git a/modelscope/models/cv/video_summarization/pgl_sum.py b/modelscope/models/cv/video_summarization/pgl_sum.py index ab3010c9..2d27501d 100644 --- a/modelscope/models/cv/video_summarization/pgl_sum.py +++ b/modelscope/models/cv/video_summarization/pgl_sum.py @@ -1,4 +1,5 @@ -# The implementation is based on PGL-SUM, available at https://github.com/e-apostolidis/PGL-SUM. +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM import math diff --git a/modelscope/models/cv/video_summarization/summarizer.py b/modelscope/models/cv/video_summarization/summarizer.py index c95da025..75251989 100644 --- a/modelscope/models/cv/video_summarization/summarizer.py +++ b/modelscope/models/cv/video_summarization/summarizer.py @@ -1,4 +1,5 @@ -# The implementation is based on PGL-SUM, available at https://github.com/e-apostolidis/PGL-SUM. +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM import os.path as osp from copy import deepcopy @@ -23,7 +24,8 @@ logger = get_logger() def get_change_points(video_feat, n_frame): video_feat = np.array(video_feat, np.float32) K = np.dot(video_feat, video_feat.T) - change_points, _ = cpd_auto(K, ncp=120, vmax=2.2 / 4.0, lmin=1) + change_points, _ = cpd_auto( + K, ncp=min(K.shape[0] - 1, 120), vmax=2.2 / 4.0, lmin=1) change_points = change_points * 15 change_points = np.concatenate(([0], change_points, [n_frame - 1])) @@ -135,6 +137,46 @@ def generate_summary(all_shot_bound, all_scores, all_nframes, all_positions): return all_summaries +def transform_time(seconds): + m, s = divmod(seconds, 60) + h, m = divmod(m, 60) + time = '%02d:%02d:%06.3f' % (h, m, s) + return time + + +def summary_format(summary, fps): + frames_list = [] + start_frame = -1 + end_frame = -1 + is_summary_frame = False + for i, idx in enumerate(summary): + if idx: + if is_summary_frame is False: + start_frame = i + is_summary_frame = True + else: + if is_summary_frame: + end_frame = i - 1 + frames_list.append([start_frame, end_frame]) + is_summary_frame = False + + if is_summary_frame and summary[-1] == 1: + end_frame = len(frame_idxes) - 1 + frames_list.append([start_frame, end_frame]) + + output = [] + for seg in frames_list: + output.append({ + 'frame': + seg, + 'timestamps': [ + transform_time(seg[0] / float(fps)), + transform_time(seg[1] / float(fps)) + ] + }) + return output + + @MODELS.register_module( Tasks.video_summarization, module_name=Models.video_summarization) class PGLVideoSummarization(TorchModel): diff --git a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py index 89deb7ba..34eb0450 100644 --- a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py +++ b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM + import os import h5py @@ -15,7 +18,7 @@ class VideoSummarizationDataset(TorchTaskDataset): self.mode = mode self.data_filename = os.path.join(root_dir, opt.dataset_file) self.split_filename = os.path.join(root_dir, opt.split_file) - self.split_index = opt.split_index # it represents the current split 
(varies from 0 to 4) + self.split_index = opt.split_index hdf = h5py.File(self.data_filename, 'r') self.list_frame_features, self.list_gtscores = [], [] self.list_user_summary = [] diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 331f4816..07a14191 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -337,6 +337,22 @@ TASK_OUTPUTS = { OutputKeys.SCENE_META_LIST ], + # video summarization result for a single video + # { + # "output": + # [ + # { + # "frame": [start_frame, end_frame] + # "timestamps": [start_time, end_time] + # }, + # { + # "frame": [start_frame, end_frame] + # "timestamps": [start_time, end_time] + # } + # ] + # } + Tasks.video_summarization: [OutputKeys.OUTPUT], + # ============ nlp tasks =================== # text classification result for single sample diff --git a/modelscope/pipelines/cv/video_summarization_pipeline.py b/modelscope/pipelines/cv/video_summarization_pipeline.py index 25ea1e7c..e4fe206d 100644 --- a/modelscope/pipelines/cv/video_summarization_pipeline.py +++ b/modelscope/pipelines/cv/video_summarization_pipeline.py @@ -1,4 +1,6 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM + import os.path as osp from typing import Any, Dict @@ -8,7 +10,8 @@ import torch from tqdm import tqdm from modelscope.metainfo import Pipelines -from modelscope.models.cv.video_summarization import PGLVideoSummarization +from modelscope.models.cv.video_summarization import (PGLVideoSummarization, + summary_format) from modelscope.models.cv.video_summarization.base_model import bvlc_googlenet from modelscope.models.cv.video_summarization.summarizer import ( generate_summary, get_change_points) @@ -57,6 +60,8 @@ class VideoSummarizationPipeline(Pipeline): frames = [] picks = [] cap = cv2.VideoCapture(input) + self.fps = cap.get(cv2.CAP_PROP_FPS) + self.frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT) frame_idx = 0 while (cap.isOpened()): ret, frame = cap.read() @@ -89,7 +94,9 @@ class VideoSummarizationPipeline(Pipeline): summary = self.inference(frame_features, input['n_frame'], input['picks'], change_points) - return {OutputKeys.OUTPUT: summary} + output = summary_format(summary, self.fps) + + return {OutputKeys.OUTPUT: output} def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/tests/pipelines/test_video_summarization.py b/tests/pipelines/test_video_summarization.py index 6dcc31e9..1f965c53 100644 --- a/tests/pipelines/test_video_summarization.py +++ b/tests/pipelines/test_video_summarization.py @@ -3,7 +3,6 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks -from modelscope.utils.cv.image_utils import show_video_summarization_result from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -22,8 +21,6 @@ class VideoSummarizationTest(unittest.TestCase, DemoCompatibilityCheck): result = summarization_pipeline(video_path) print(f'video summarization output: \n{result}.') - show_video_summarization_result(video_path, result, - './summarization_result.avi') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_modelhub_default_model(self): From 02c913a0fee0bbb0a6ade9086c9c142f508ab3e0 Mon Sep 17 00:00:00 2001 From: "suluyan.sly" Date: Tue, 11 Oct 2022 17:26:43 +0800 Subject: [PATCH 10/57] [to #42322933] add plug doc string Link: 
https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10337105 --- .../models/nlp/plug/configuration_plug.py | 165 +++++++----- .../models/nlp/plug/distributed_plug.py | 44 +++- modelscope/models/nlp/plug/modeling_plug.py | 243 ++++++++---------- 3 files changed, 240 insertions(+), 212 deletions(-) diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration_plug.py index 64807392..c3a526a9 100644 --- a/modelscope/models/nlp/plug/configuration_plug.py +++ b/modelscope/models/nlp/plug/configuration_plug.py @@ -40,8 +40,6 @@ class PlugNLUConfig(PretrainedConfig): max_position_embeddings=2048, type_vocab_size=3, initializer_range=0.00707, - deep_init=False, - deepspeed=False, lr_decay_style='linear', weight_decay=1e-2, clip_grad=1.0, @@ -53,20 +51,7 @@ class PlugNLUConfig(PretrainedConfig): fp32_tokentypes=False, layernorm_epsilon=1e-5, dec_hidden_layers=6, - pruning_method=None, - pruning_mask_init='constant', - pruning_mask_scale=0.0, - pruning_initial_threshold=1.0, - pruning_final_threshold=0.01, - pruning_initial_warmup=1, - pruning_final_warmup=20, - pruning_module='decoder', - pruning_decay_step=50, - pruning_decay_type='exp', - ft_module=None, attn_separate=False, - LR_weight_rank=8, - LR_mask_rank=8, **kwargs): super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) @@ -82,8 +67,6 @@ class PlugNLUConfig(PretrainedConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range - self.deep_init = deep_init - self.deepspeed = deepspeed self.lr_decay_style = lr_decay_style self.weight_decay = weight_decay self.clip_grad = clip_grad @@ -95,20 +78,7 @@ class PlugNLUConfig(PretrainedConfig): self.layernorm_epsilon = layernorm_epsilon self.fp32_tokentypes = fp32_tokentypes self.dec_hidden_layers = dec_hidden_layers - self.pruning_method = pruning_method - self.pruning_mask_init = pruning_mask_init - self.pruning_mask_scale = pruning_mask_scale - self.pruning_module = pruning_module - self.pruning_initial_threshold = pruning_initial_threshold - self.pruning_final_threshold = pruning_final_threshold - self.pruning_initial_warmup = pruning_initial_warmup - self.pruning_final_warmup = pruning_final_warmup - self.pruning_decay_step = pruning_decay_step - self.pruning_decay_type = pruning_decay_type - self.ft_module = ft_module self.attn_separate = attn_separate - self.LR_weight_rank = LR_weight_rank - self.LR_mask_rank = LR_mask_rank @classmethod def from_dict(cls, json_object): @@ -148,47 +118,115 @@ class PlugNLUConfig(PretrainedConfig): class PlugNLGConfig(PlugNLUConfig): + """ + This is the configuration class to store the configuration of a [`PlugModel`]. It is used to instantiate a + PLUG understanding model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the PLUG + [PLUG](https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary) architecture. + + Configuration objects inherit from [`PlugNLUConfig`] and can be used to control the model outputs. Read the + documentation from [`PlugNLUConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 21504): + Padded vocabulary size of the PLUG model for vocab tensor parallel. Defines the number of different tokens + that can be represented by the `inputs_ids` passed when calling [`PlugModel`]. 
+ original_vocab_size (`int`, *optional*, defaults to 21128): + True vocabulary size of the PLUG model. Defines the number of different tokens that can be represented. + hidden_size (`int`, *optional*, defaults to 8192): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + dec_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 128): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 32768): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the Transformer Attention. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 3): + The vocabulary size of the `token_type_ids` passed when calling [`PlugModel`]. + initializer_range (`float`, *optional*, defaults to 0.00707): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + lr_decay_style (`str`, *optional*, defaults to 'linear'): + The decay style of learning rate during fine-tunining. If string, `"linear"`, `"cosine"`, `"exponential"`, + `"constant"`, `"None"` are supported. + weight_decay (`float`, *optional*, defaults to 1e-2): + Decoupled weight decay to apply. + clip_grad (`float`, *optional*, defaults to 1.0): + Maximum gradient norm for gradient clipping. + warmup (`float`, *optional*, defaults to 0.01): + Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. + pre_ln (`boolean`, *optional*, defaults to `True`): + Whether or not to apply LayerNorm to the input instead of the output in the blocks. + fp16 (`boolean`, *optional*, defaults to `True`): + Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. + fp32_layernorm (`boolean`, *optional*, defaults to `True`): + Whether to use fp32 32-bit precision LayerNorm training while the argument `fp16` set to `True`. + fp32_embedding (`boolean`, *optional*, defaults to `False`): + Whether to use fp32 32-bit precision Embedding training while the argument `fp16` set to `True`. + fp32_tokentypes (`boolean`, *optional*, defaults to `False`): + Whether to use fp32 32-bit precision token types training while the argument `fp16` set to `True`. + layernorm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + attn_separate (`boolean`, *optional*, defaults to `False`): + Whether or not to separate query-key-value to query, key, value in the Attention. + + Example: + + ```python + >>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. 
The example given + >>> # here only initializes a slice of the model on a single GPU. + >>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model. + >>> from modelscope.models.nlp.plug import PlugNLGConfig, PlugModel + + >>> # Initializing a Plug configuration + >>> configuration = PlugNLGConfig() + + >>> # Initializing a model from the configuration + >>> model = PlugModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = 'plugNLG' def __init__(self, vocab_size=21504, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, + original_vocab_size=21128, + hidden_size=8192, + num_hidden_layers=24, + dec_hidden_layers=6, + num_attention_heads=128, + intermediate_size=32768, hidden_act='gelu', hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, + max_position_embeddings=2048, + type_vocab_size=3, initializer_range=0.00707, - deep_init=False, - deepspeed=False, lr_decay_style='linear', weight_decay=1e-2, clip_grad=1.0, warmup=0.01, - pre_ln=False, - fp16=False, - fp32_layernorm=False, + pre_ln=True, + fp16=True, + fp32_layernorm=True, fp32_embedding=False, fp32_tokentypes=False, - layernorm_epsilon=1e-12, - dec_hidden_layers=6, - pruning_method=None, - pruning_mask_init='constant', - pruning_mask_scale=0.0, - pruning_initial_threshold=1.0, - pruning_final_threshold=0.01, - pruning_initial_warmup=1, - pruning_final_warmup=20, - pruning_module='decoder', - pruning_decay_step=50, - pruning_decay_type='exp', - ft_module=None, + layernorm_epsilon=1e-5, attn_separate=False, - LR_weight_rank=8, - LR_mask_rank=8, **kwargs): super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) @@ -203,8 +241,6 @@ class PlugNLGConfig(PlugNLUConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range - self.deep_init = deep_init - self.deepspeed = deepspeed self.lr_decay_style = lr_decay_style self.weight_decay = weight_decay self.clip_grad = clip_grad @@ -216,17 +252,4 @@ class PlugNLGConfig(PlugNLUConfig): self.layernorm_epsilon = layernorm_epsilon self.fp32_tokentypes = fp32_tokentypes self.dec_hidden_layers = dec_hidden_layers - self.pruning_method = pruning_method - self.pruning_mask_init = pruning_mask_init - self.pruning_mask_scale = pruning_mask_scale - self.pruning_module = pruning_module - self.pruning_initial_threshold = pruning_initial_threshold - self.pruning_final_threshold = pruning_final_threshold - self.pruning_initial_warmup = pruning_initial_warmup - self.pruning_final_warmup = pruning_final_warmup - self.pruning_decay_step = pruning_decay_step - self.pruning_decay_type = pruning_decay_type - self.ft_module = ft_module self.attn_separate = attn_separate - self.LR_weight_rank = LR_weight_rank - self.LR_mask_rank = LR_mask_rank diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py index 2992f595..06009ba1 100644 --- a/modelscope/models/nlp/plug/distributed_plug.py +++ b/modelscope/models/nlp/plug/distributed_plug.py @@ -20,6 +20,48 @@ logger = get_logger(__name__) class DistributedPlug(TorchModel): + """ + The wapper class of PLUG Model to initialize parallel environment, load model weights, generate sentences. + Parameters: + model_dir (`str`, *required*): + Path to model damo/nlp_plug_text-generation_27B. 
+ The model structure in model_dir should be like this: + model_dir + |_ config.json + |_ configuration.json + |_ ds_zero-offload_10B_config.json + |_ vocab.txt + |_ model <-- an empty directory + + Model binaries shall be downloaded separately to populate the model directory, so that + the model directory would contain the following binaries: + |_ model + |_ mp_rank_00_model_states.pt + |_ mp_rank_01_model_states.pt + |_ mp_rank_02_model_states.pt + |_ mp_rank_03_model_states.pt + |_ mp_rank_04_model_states.pt + |_ mp_rank_05_model_states.pt + |_ mp_rank_06_model_states.pt + |_ mp_rank_07_model_states.pt + rank (`int`, *required*): + Used to identify different GPUs in a tensor parallel environment. eg. The rank of GPU #0 is 0, and the + model file `mp_rank_00_model_states.pt` will be loaded on this GPU. + world_size (`int`, *required*, defaults to 8): + The parallel size in total. + model_parallel_size (`int`, *required*, defaults to 8): + The parallel size of model(tensor parallel). + master_ip (`str`, *required*): + The master IP, can usually be set to `"127.0.0.1"`, used as part of + [`~torch.distributed.init_process_group`] method parameter `init_method`. + `init_method` = `"tcp://{master_ip}:{master_port}"` + master_port (`str`, *required*): + The master port, can usually be set to `"29500"`, used as part of + [`~torch.distributed.init_process_group`] method parameter `init_method`. + `init_method` = `"tcp://{master_ip}:{master_port}"` + seed (`int`, *optional*, defaults to 42): + Random seed to control sampling. + """ def __init__(self, model_dir, rank, **kwargs): super().__init__(model_dir, **kwargs) @@ -29,7 +71,7 @@ class DistributedPlug(TorchModel): initialize_distributed(rank, mpu, kwargs['world_size'], kwargs['model_parallel_size'], kwargs['master_ip'], kwargs['master_port']) - seed = 0 if 'seed' not in kwargs else kwargs['seed'] + seed = 42 if 'seed' not in kwargs else kwargs['seed'] set_random_seed_mpu(seed) self.iteration = 0 self.dist_model = self.initialize_model(path_load_tag='model') diff --git a/modelscope/models/nlp/plug/modeling_plug.py b/modelscope/models/nlp/plug/modeling_plug.py index 9d2bb14f..df00006b 100644 --- a/modelscope/models/nlp/plug/modeling_plug.py +++ b/modelscope/models/nlp/plug/modeling_plug.py @@ -152,15 +152,7 @@ class BertSelfOutput(nn.Module): bias=True, input_is_parallel=True, stride=1, - init_method=init_method, - pruning_method=config.pruning_method if config.pruning_module in [ - 'all', 'encoder', 'encoder_self', 'encoder_selfvo', - 'encoder_selfo' - ] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + init_method=init_method) self.fp32_layernorm = config.fp32_layernorm if not config.pre_ln: self.LayerNorm = BertLayerNorm( @@ -173,12 +165,8 @@ class BertSelfOutput(nn.Module): self, hidden_states, input_tensor, - pruning_threshold=None, ): - hidden_states = self.dense( - hidden_states, - pruning_threshold=pruning_threshold, - ) + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) ln_input = hidden_states + input_tensor if self.LayerNorm is not None: @@ -210,20 +198,13 @@ class BertAttention(nn.Module): output_parallel=True, init_method=normal_init_method( mean=0.0, std=config.initializer_range), - separate=config.attn_separate, - pruning_method=config.pruning_method, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - 
pruning_module=config.pruning_module, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + separate=config.attn_separate) self.output = BertSelfOutput(config) def forward( self, input_tensor, attention_mask, - pruning_threshold=None, ): if self.LayerNorm is not None: ln_input = input_tensor @@ -236,20 +217,16 @@ class BertAttention(nn.Module): self_output = self.self( ln_output, attention_mask, - pruning_threshold=pruning_threshold, ) else: self_output = self.self( input_tensor, attention_mask, - pruning_threshold=pruning_threshold, ) - output_pruning_threshold = pruning_threshold attention_output = self.output( self_output, input_tensor, - pruning_threshold=output_pruning_threshold, ) return attention_output @@ -265,25 +242,15 @@ class BertIntermediate(nn.Module): gather_output=False, stride=1, init_method=normal_init_method( - mean=0.0, std=config.initializer_range), - pruning_method=config.pruning_method if config.pruning_module - in ['all', 'encoder', 'encoder_ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + mean=0.0, std=config.initializer_range)) self.intermediate_act_fn = ACT2FN[config.hidden_act] \ if isinstance(config.hidden_act, str) else config.hidden_act def forward( self, hidden_states, - pruning_threshold=None, ): - hidden_states = self.dense( - hidden_states, - pruning_threshold=pruning_threshold, - ) + hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states @@ -306,13 +273,7 @@ class BertOutput(nn.Module): bias=True, input_is_parallel=True, stride=1, - init_method=init_method, - pruning_method=config.pruning_method if config.pruning_module - in ['all', 'encoder', 'encoder_ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + init_method=init_method) self.fp32_layernorm = config.fp32_layernorm if not config.pre_ln: self.LayerNorm = BertLayerNorm( @@ -325,12 +286,8 @@ class BertOutput(nn.Module): self, hidden_states, input_tensor, - pruning_threshold=None, ): - hidden_states = self.dense( - hidden_states, - pruning_threshold=pruning_threshold, - ) + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) ln_input = hidden_states + input_tensor if self.LayerNorm is not None: @@ -359,14 +316,8 @@ class BertLayer(nn.Module): else: self.LayerNorm = None - def forward( - self, - hidden_states, - attention_mask, - pruning_threshold=None, - ): - attention_output = self.attention( - hidden_states, attention_mask, pruning_threshold=pruning_threshold) + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) if self.LayerNorm is not None: ln_input = attention_output previous_type = attention_output.type() @@ -375,15 +326,10 @@ class BertLayer(nn.Module): ln_output = self.LayerNorm(ln_input) if self.fp32_layernorm: ln_output = ln_output.type(previous_type) - intermediate_output = self.intermediate( - ln_output, pruning_threshold=pruning_threshold) + intermediate_output = self.intermediate(ln_output) else: - intermediate_output = self.intermediate( - attention_output, pruning_threshold=pruning_threshold) - layer_output = self.output( - intermediate_output, - attention_output, - pruning_threshold=pruning_threshold) + intermediate_output = 
self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) return layer_output @@ -407,7 +353,6 @@ class BertEncoder(nn.Module): output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, - pruning_threshold=None, ): all_encoder_layers = [] @@ -417,8 +362,7 @@ class BertEncoder(nn.Module): layers = self.layer[start:end] x_ = inputs[0] for layer in layers: - x_ = layer( - x_, inputs[1], pruning_threshold=pruning_threshold) + x_ = layer(x_, inputs[1]) return x_ return custom_forward @@ -654,7 +598,6 @@ class BertModel(PreTrainedBertModel): output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, - pruning_threshold=None, ): if attention_mask is None: attention_mask = torch.ones_like(input_ids) @@ -683,8 +626,7 @@ class BertModel(PreTrainedBertModel): extended_attention_mask, output_all_encoded_layers=output_all_encoded_layers, checkpoint_activations=checkpoint_activations, - detach_index=detach_index, - pruning_threshold=pruning_threshold) + detach_index=detach_index) sequence_output = encoded_layers[-1] for p in self.pooler.parameters(): if p is None: @@ -709,18 +651,6 @@ class DecodeLayer(nn.Module): std=config.initializer_range, num_layers=config.num_hidden_layers) - self_pruning_method = config.pruning_method - cross_pruning_method = config.pruning_method - ffn_pruning_method = config.pruning_method - - if config.ft_module is not None: - if 'decoder_self' in config.ft_module: - self_pruning_method = 'finetune' - if 'decoder_cross' in config.ft_module: - cross_pruning_method = 'finetune' - if 'decoder_ffn' in config.ft_module: - ffn_pruning_method = 'finetune' - self.attention = mpu.GPT2ParallelSelfAttention( hidden_size=config.hidden_size, num_attention_heads=config.num_attention_heads, @@ -728,13 +658,6 @@ class DecodeLayer(nn.Module): output_dropout_prob=config.hidden_dropout_prob, init_method=init_method, output_layer_init_method=output_layer_init_method, - pruning_method=self_pruning_method if config.pruning_module in [ - 'all', 'decoder', 'decoder_self', 'decoder_self+ffn' - ] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.cross_attention = mpu.PalmParallelCrossAttention( @@ -745,12 +668,6 @@ class DecodeLayer(nn.Module): init_method=init_method, attn_separate=False, output_layer_init_method=output_layer_init_method, - pruning_method=cross_pruning_method, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - pruning_module=config.pruning_module, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.input_layernorm = BertLayerNorm( @@ -765,12 +682,6 @@ class DecodeLayer(nn.Module): config.intermediate_size, gather_output=False, init_method=init_method, - pruning_method=ffn_pruning_method if config.pruning_module - in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.intermediate_act_fn = ACT2FN[config.hidden_act] \ if isinstance(config.hidden_act, str) else config.hidden_act @@ -779,12 +690,6 @@ class DecodeLayer(nn.Module): config.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, - pruning_method=ffn_pruning_method if config.pruning_module - in ['all', 'decoder', 
'decoder_ffn', 'decoder_self+ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) @@ -804,8 +709,7 @@ class DecodeLayer(nn.Module): enc_hidden_states, enc_attn_mask, dec_attn_mask, - is_infer=False, - pruning_threshold=None): + is_infer=False): residual = hidden_states previous_type = hidden_states.type() hidden_states = self.input_layernorm( @@ -813,10 +717,7 @@ class DecodeLayer(nn.Module): if self.fp32_layernorm: hidden_states = hidden_states.type(previous_type) hidden_states = self.attention( - hidden_states, - dec_attn_mask, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + hidden_states, dec_attn_mask, is_infer=is_infer) hidden_states = residual + hidden_states @@ -825,23 +726,18 @@ class DecodeLayer(nn.Module): self.type_converter(hidden_states)) if self.fp32_layernorm: hidden_states = hidden_states.type(previous_type) - hidden_states = self.cross_attention( - hidden_states, - enc_hidden_states, - enc_attn_mask, - pruning_threshold=pruning_threshold) + hidden_states = self.cross_attention(hidden_states, enc_hidden_states, + enc_attn_mask) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.post_cross_attention_layernorm( self.type_converter(hidden_states)) if self.fp32_layernorm: hidden_states = hidden_states.type(previous_type) - hidden_states = self.intermediate( - hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.intermediate(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) - hidden_states = self.output( - hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.output(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = residual + hidden_states @@ -866,8 +762,7 @@ class BertDecoder(nn.Module): dec_attn_mask, checkpoint_activations=False, output_all_encoded_layers=False, - is_infer=False, - pruning_threshold=None): + is_infer=False): def custom(start, end): @@ -880,8 +775,7 @@ class BertDecoder(nn.Module): inputs[1], inputs[2], dec_attn_mask * 1, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) return x_ return custom_forward @@ -904,8 +798,7 @@ class BertDecoder(nn.Module): enc_hidden_states, enc_attn_mask, dec_attn_mask, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) previous_type = hidden_states.type() if self.fp32_layernorm: @@ -932,8 +825,7 @@ class DecodeModel(PreTrainedBertModel): enc_attn_mask=None, dec_attn_mask=None, checkpoint_activations=False, - is_infer=False, - pruning_threshold=None): + is_infer=False): extended_attention_mask = enc_attn_mask.unsqueeze(1).unsqueeze(2) extended_attention_mask = extended_attention_mask.to( dtype=next(self.decoder.parameters()).dtype) # fp16 compatibility @@ -946,8 +838,7 @@ class DecodeModel(PreTrainedBertModel): extended_attention_mask, dec_attn_mask, checkpoint_activations=False, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) return sequence_output[-1] @@ -972,16 +863,14 @@ class PalmForPreTraining(PreTrainedBertModel): checkpoint_activations=False, is_infer=False, sequence_output=None, - parallel_output=True, - pruning_threshold=None): + parallel_output=True): if sequence_output is None: sequence_output, pooled_output = self.bert( input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, - 
checkpoint_activations=checkpoint_activations, - pruning_threshold=pruning_threshold) + checkpoint_activations=checkpoint_activations) prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output) else: @@ -998,8 +887,7 @@ class PalmForPreTraining(PreTrainedBertModel): attention_mask, decode_attention_mask, checkpoint_activations=checkpoint_activations, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) transformer_output_parallel = mpu.copy_to_model_parallel_region( decode_output) @@ -1017,6 +905,29 @@ class PalmForPreTraining(PreTrainedBertModel): class PlugModel(torch.nn.Module): + """ + The bare Plug Model transformer outputting raw hidden-states without any specific head on top. + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`PlugNLGConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~DistributedPlug.initialize_model`] method to load the model weights. + Example: + + ```python + >>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. The example given + >>> # here only initializes a slice of the model on a single GPU. + >>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model. + >>> from modelscope.models.nlp.plug import PlugNLGConfig, PlugModel + + >>> # Initializing a Plug configuration + >>> configuration = PlugNLGConfig() + + >>> # Initializing a model from the configuration + >>> model = PlugModel(configuration) + """ def __init__(self, config): super(PlugModel, self).__init__() @@ -1034,6 +945,58 @@ class PlugModel(torch.nn.Module): is_infer=False, sequence_output=None, parallel_output=True): + """ + Parameters: + input_tokens (`torch.LongTensor` of shape `(batch_size, input_tokens_length)`): + `input_tokens_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary. + Indices can be obtained using transformers [`BertTokenizer`]. See + [`TextGenerationPreprocessor.__call__`] for details. + token_type_ids (`torch.LongTensor` of shape `(batch_size, input_tokens_length)`, *optional*, defaults to + None): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + target_tokens (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None): + Target token ids(labels) for language modeling. Note that the labels **are shifted** inside the model, + i.e. 
you can set `target_tokens = input_tokens` Indices are selected in + `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only + computed for labels in `[0, ..., config.vocab_size]` + + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.max_position_embeddings - 1]`. + + decode_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults + to None): + Mask to avoid performing attention on padding token indices of target tokens. Mask values selected in + `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + checkpoint_activations (`boolean`, *optional*, defaults to `False`): + Whether gradient checkpointing is activated for this model or not. + is_infer (`boolean`, *optional*, defaults to `False`): + Whether or not to perform single inference. + sequence_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, + defaults to None): + Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the + model. A single forward() call can produce one single token. To generate the current token, the + sequence_output generated by the `forward()` of the previous token is required. + parallel_output (`boolean`, *optional*, defaults to `True`): + To parallel return output, or gather it before return. + + + """ return self.model( input_tokens, token_type_ids, From 69da8f91ac5ca420408100c4ec5abd0c5987e65a Mon Sep 17 00:00:00 2001 From: "ashui.cbh" Date: Tue, 11 Oct 2022 20:49:13 +0800 Subject: [PATCH 11/57] [to #42322933]suport image inpainting Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10111615 --- .../image_inpainting/image_inpainting.png | 3 + .../image_inpainting_mask.png | 3 + modelscope/metainfo.py | 5 + modelscope/metrics/__init__.py | 2 + modelscope/metrics/builder.py | 2 + modelscope/metrics/image_inpainting_metric.py | 210 +++++++ modelscope/models/cv/__init__.py | 17 +- .../models/cv/crowd_counting/cc_model.py | 2 + .../cv/crowd_counting/hrnet_aspp_relu.py | 14 +- .../models/cv/image_inpainting/__init__.py | 22 + modelscope/models/cv/image_inpainting/base.py | 75 +++ .../models/cv/image_inpainting/default.py | 210 +++++++ .../models/cv/image_inpainting/model.py | 36 ++ .../cv/image_inpainting/modules/__init__.py | 0 .../modules/ade20k/__init__.py | 2 + .../image_inpainting/modules/ade20k/base.py | 380 +++++++++++ .../image_inpainting/modules/ade20k/resnet.py | 183 ++++++ .../image_inpainting/modules/adversarial.py | 167 +++++ .../modules/feature_matching.py | 45 ++ .../models/cv/image_inpainting/modules/ffc.py | 588 ++++++++++++++++++ .../cv/image_inpainting/modules/inception.py | 324 ++++++++++ .../cv/image_inpainting/modules/perceptual.py | 47 ++ .../cv/image_inpainting/modules/pix2pixhd.py | 75 +++ .../models/cv/image_inpainting/refinement.py | 393 ++++++++++++ .../msdatasets/task_datasets/__init__.py | 2 + .../image_inpainting/__init__.py | 2 + .../task_datasets/image_inpainting/aug.py | 100 +++ .../image_inpainting_dataset.py | 337 ++++++++++ modelscope/outputs.py | 1 + modelscope/pipelines/builder.py | 2 + modelscope/pipelines/cv/__init__.py | 2 + .../pipelines/cv/image_inpainting_pipeline.py | 146 +++++ modelscope/trainers/__init__.py | 5 +- modelscope/trainers/cv/__init__.py | 4 +- 
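Relating back to the `PlugModel.forward` arguments documented above, the 0/1 conventions for `token_type_ids` (sentence A vs. B) and `attention_mask` (1 = attend, 0 = padding) can be illustrated with a toy helper; the ids and the helper name are made up for the example:

```python
# Toy illustration of the segment/mask conventions from the forward() docstring.
import torch

def build_plug_inputs(ids_a, ids_b, max_len):
    ids = ids_a + ids_b
    pad = max_len - len(ids)
    input_tokens = torch.tensor([ids + [0] * pad])
    token_type_ids = torch.tensor([[0] * len(ids_a) + [1] * len(ids_b) + [0] * pad])
    attention_mask = torch.tensor([[1] * len(ids) + [0] * pad])  # 1 = real token, 0 = padding
    return input_tokens, token_type_ids, attention_mask

inputs = build_plug_inputs([101, 2769, 102], [704, 1744, 102], max_len=8)
print([t.shape for t in inputs])  # three (1, 8) tensors
```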
.../trainers/cv/image_inpainting_trainer.py | 111 ++++ modelscope/utils/constant.py | 4 +- requirements/cv.txt | 2 + tests/pipelines/test_image_inpainting.py | 77 +++ tests/run_config.yaml | 1 + .../trainers/test_image_inpainting_trainer.py | 84 +++ 40 files changed, 3666 insertions(+), 19 deletions(-) create mode 100644 data/test/images/image_inpainting/image_inpainting.png create mode 100644 data/test/images/image_inpainting/image_inpainting_mask.png create mode 100644 modelscope/metrics/image_inpainting_metric.py create mode 100644 modelscope/models/cv/image_inpainting/__init__.py create mode 100644 modelscope/models/cv/image_inpainting/base.py create mode 100644 modelscope/models/cv/image_inpainting/default.py create mode 100644 modelscope/models/cv/image_inpainting/model.py create mode 100644 modelscope/models/cv/image_inpainting/modules/__init__.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ade20k/base.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py create mode 100644 modelscope/models/cv/image_inpainting/modules/adversarial.py create mode 100644 modelscope/models/cv/image_inpainting/modules/feature_matching.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ffc.py create mode 100644 modelscope/models/cv/image_inpainting/modules/inception.py create mode 100644 modelscope/models/cv/image_inpainting/modules/perceptual.py create mode 100644 modelscope/models/cv/image_inpainting/modules/pix2pixhd.py create mode 100644 modelscope/models/cv/image_inpainting/refinement.py create mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/__init__.py create mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/aug.py create mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py create mode 100644 modelscope/pipelines/cv/image_inpainting_pipeline.py create mode 100644 modelscope/trainers/cv/image_inpainting_trainer.py create mode 100644 tests/pipelines/test_image_inpainting.py create mode 100644 tests/trainers/test_image_inpainting_trainer.py diff --git a/data/test/images/image_inpainting/image_inpainting.png b/data/test/images/image_inpainting/image_inpainting.png new file mode 100644 index 00000000..e141012d --- /dev/null +++ b/data/test/images/image_inpainting/image_inpainting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46db348eae61448f1668ce282caec21375e96c3268d53da44aa67ec32cbf4fa5 +size 2747938 diff --git a/data/test/images/image_inpainting/image_inpainting_mask.png b/data/test/images/image_inpainting/image_inpainting_mask.png new file mode 100644 index 00000000..e30f67e7 --- /dev/null +++ b/data/test/images/image_inpainting/image_inpainting_mask.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:709c1828ed2d56badf2f19a40194da9a5e5e6db2fb73ef55d047407f49bc7a15 +size 27616 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 77627abc..cae9d188 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -27,6 +27,7 @@ class Models(object): face_2d_keypoints = 'face-2d-keypoints' panoptic_segmentation = 'swinL-panoptic-segmentation' image_reid_person = 'passvitb' + image_inpainting = 'FFTInpainting' video_summarization = 'pgl-video-summarization' swinL_semantic_segmentation = 'swinL-semantic-segmentation' vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' @@ -179,6 +180,7 @@ class 
Pipelines(object): video_summarization = 'googlenet_pgl_video_summarization' image_semantic_segmentation = 'image-semantic-segmentation' image_reid_person = 'passvitb-image-reid-person' + image_inpainting = 'fft-inpainting' text_driven_segmentation = 'text-driven-segmentation' movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' shop_segmentation = 'shop-segmentation' @@ -264,6 +266,7 @@ class Trainers(object): image_portrait_enhancement = 'image-portrait-enhancement' video_summarization = 'video-summarization' movie_scene_segmentation = 'movie-scene-segmentation' + image_inpainting = 'image-inpainting' # nlp trainers bert_sentiment_analysis = 'bert-sentiment-analysis' @@ -363,6 +366,8 @@ class Metrics(object): video_summarization_metric = 'video-summarization-metric' # metric for movie-scene-segmentation task movie_scene_segmentation_metric = 'movie-scene-segmentation-metric' + # metric for inpainting task + image_inpainting_metric = 'image-inpainting-metric' class Optimizers(object): diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index d3975a2c..e6a03a22 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -17,6 +17,7 @@ if TYPE_CHECKING: from .token_classification_metric import TokenClassificationMetric from .video_summarization_metric import VideoSummarizationMetric from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric + from .image_inpainting_metric import ImageInpaintingMetric else: _import_structure = { @@ -34,6 +35,7 @@ else: 'token_classification_metric': ['TokenClassificationMetric'], 'video_summarization_metric': ['VideoSummarizationMetric'], 'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'], + 'image_inpainting_metric': ['ImageInpaintingMetric'], } import sys diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 9e875cc4..ee4d2840 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -18,6 +18,7 @@ class MetricKeys(object): SSIM = 'ssim' AVERAGE_LOSS = 'avg_loss' FScore = 'fscore' + FID = 'fid' BLEU_1 = 'bleu-1' BLEU_4 = 'bleu-4' ROUGE_1 = 'rouge-1' @@ -39,6 +40,7 @@ task_default_metrics = { Tasks.image_captioning: [Metrics.text_gen_metric], Tasks.visual_question_answering: [Metrics.text_gen_metric], Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric], + Tasks.image_inpainting: [Metrics.image_inpainting_metric], } diff --git a/modelscope/metrics/image_inpainting_metric.py b/modelscope/metrics/image_inpainting_metric.py new file mode 100644 index 00000000..954d4ca2 --- /dev/null +++ b/modelscope/metrics/image_inpainting_metric.py @@ -0,0 +1,210 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import Dict + +import numpy as np +import torch +import torch.nn.functional as F +from scipy import linalg + +from modelscope.metainfo import Metrics +from modelscope.models.cv.image_inpainting.modules.inception import InceptionV3 +from modelscope.utils.registry import default_group +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) +from .base import Metric +from .builder import METRICS, MetricKeys + + +def fid_calculate_activation_statistics(act): + mu = np.mean(act, axis=0) + sigma = np.cov(act, rowvar=False) + return mu, sigma + + +def calculate_frechet_distance(activations_pred, activations_target, eps=1e-6): + mu1, sigma1 = 
fid_calculate_activation_statistics(activations_pred) + mu2, sigma2 = fid_calculate_activation_statistics(activations_target) + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + # if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-2): + m = np.max(np.abs(covmean.imag)) + raise ValueError('Imaginary component {}'.format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) + - 2 * tr_covmean) + + +class FIDScore(torch.nn.Module): + + def __init__(self, dims=2048, eps=1e-6): + super().__init__() + if getattr(FIDScore, '_MODEL', None) is None: + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] + FIDScore._MODEL = InceptionV3([block_idx]).eval() + self.model = FIDScore._MODEL + self.eps = eps + self.reset() + + def forward(self, pred_batch, target_batch, mask=None): + activations_pred = self._get_activations(pred_batch) + activations_target = self._get_activations(target_batch) + + self.activations_pred.append(activations_pred.detach().cpu()) + self.activations_target.append(activations_target.detach().cpu()) + + def get_value(self): + activations_pred, activations_target = (self.activations_pred, + self.activations_target) + activations_pred = torch.cat(activations_pred).cpu().numpy() + activations_target = torch.cat(activations_target).cpu().numpy() + + total_distance = calculate_frechet_distance( + activations_pred, activations_target, eps=self.eps) + + self.reset() + return total_distance + + def reset(self): + self.activations_pred = [] + self.activations_target = [] + + def _get_activations(self, batch): + activations = self.model(batch)[0] + if activations.shape[2] != 1 or activations.shape[3] != 1: + assert False, \ + 'We should not have got here, because Inception always scales inputs to 299x299' + activations = activations.squeeze(-1).squeeze(-1) + return activations + + +class SSIM(torch.nn.Module): + """SSIM. 
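For reference, `calculate_frechet_distance` above evaluates the closed-form Fréchet distance d² = ‖μ₁ − μ₂‖² + Tr(Σ₁ + Σ₂ − 2(Σ₁Σ₂)^{1/2}) between Gaussians fitted to the two activation sets. A quick sanity check, assuming the function is imported from the new metric module; random 8-d activations stand in for Inception features:

```python
# Identical activation sets give ~0; shifting every one of the 8 dimensions
# by 1 changes only the mean term, so the distance becomes ~8.
import numpy as np
from modelscope.metrics.image_inpainting_metric import calculate_frechet_distance

act = np.random.RandomState(0).randn(64, 8)
print(round(calculate_frechet_distance(act, act), 6))        # ~0.0
print(round(calculate_frechet_distance(act, act + 1.0), 6))  # ~8.0
```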
Modified from: + https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py + """ + + def __init__(self, window_size=11, size_average=True): + super().__init__() + self.window_size = window_size + self.size_average = size_average + self.channel = 1 + self.register_buffer('window', + self._create_window(window_size, self.channel)) + + def forward(self, img1, img2): + assert len(img1.shape) == 4 + + channel = img1.size()[1] + + if channel == self.channel and self.window.data.type( + ) == img1.data.type(): + window = self.window + else: + window = self._create_window(self.window_size, channel) + + window = window.type_as(img1) + + self.window = window + self.channel = channel + + return self._ssim(img1, img2, window, self.window_size, channel, + self.size_average) + + def _gaussian(self, window_size, sigma): + gauss = torch.Tensor([ + np.exp(-(x - (window_size // 2))**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + def _create_window(self, window_size, channel): + _1D_window = self._gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm( + _1D_window.t()).float().unsqueeze(0).unsqueeze(0) + return _2D_window.expand(channel, 1, window_size, + window_size).contiguous() + + def _ssim(self, + img1, + img2, + window, + window_size, + channel, + size_average=True): + mu1 = F.conv2d( + img1, window, padding=(window_size // 2), groups=channel) + mu2 = F.conv2d( + img2, window, padding=(window_size // 2), groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=(window_size // 2), + groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=(window_size // 2), + groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=(window_size // 2), + groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / \ + ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + + return ssim_map.mean(1).mean(1).mean(1) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + return + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.image_inpainting_metric) +class ImageInpaintingMetric(Metric): + """The metric computation class for image inpainting classes. + """ + + def __init__(self): + self.preds = [] + self.targets = [] + self.SSIM = SSIM(window_size=11, size_average=False).eval() + device = 'cuda' if torch.cuda.is_available() else 'cpu' + self.FID = FIDScore().to(device) + + def add(self, outputs: Dict, inputs: Dict): + pred = outputs['inpainted'] + target = inputs['image'] + self.preds.append(torch_nested_detach(pred)) + self.targets.append(torch_nested_detach(target)) + + def evaluate(self): + ssim_list = [] + for (pred, target) in zip(self.preds, self.targets): + ssim_list.append(self.SSIM(pred, target)) + self.FID(pred, target) + ssim_list = torch_nested_numpify(ssim_list) + fid = self.FID.get_value() + return {MetricKeys.SSIM: np.mean(ssim_list), MetricKeys.FID: fid} diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index f2798b59..ba7b03c5 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -5,13 +5,14 @@ from . 
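A quick way to see the SSIM module above behaving as expected, assuming N×C×H×W inputs in [0, 1] and that the class is importable from the new metric module; the values shown are approximate:

```python
# SSIM of an image with itself is exactly 1; against its inverse it drops sharply.
import torch
from modelscope.metrics.image_inpainting_metric import SSIM

ssim = SSIM(window_size=11, size_average=True).eval()
img = torch.rand(1, 3, 64, 64)
print(float(ssim(img, img)))        # ~1.0
print(float(ssim(img, 1.0 - img)))  # well below 1.0
```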
import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, crowd_counting, face_2d_keypoints, face_detection, face_generation, image_classification, image_color_enhance, - image_colorization, image_denoise, image_instance_segmentation, - image_panoptic_segmentation, image_portrait_enhancement, - image_reid_person, image_semantic_segmentation, - image_to_image_generation, image_to_image_translation, - movie_scene_segmentation, object_detection, - product_retrieval_embedding, realtime_object_detection, - salient_detection, shop_segmentation, super_resolution, - video_single_object_tracking, video_summarization, virual_tryon) + image_colorization, image_denoise, image_inpainting, + image_instance_segmentation, image_panoptic_segmentation, + image_portrait_enhancement, image_reid_person, + image_semantic_segmentation, image_to_image_generation, + image_to_image_translation, movie_scene_segmentation, + object_detection, product_retrieval_embedding, + realtime_object_detection, salient_detection, shop_segmentation, + super_resolution, video_single_object_tracking, + video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/crowd_counting/cc_model.py b/modelscope/models/cv/crowd_counting/cc_model.py index 582b26f4..16fbc261 100644 --- a/modelscope/models/cv/crowd_counting/cc_model.py +++ b/modelscope/models/cv/crowd_counting/cc_model.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict, Optional, Union diff --git a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py index 982ba939..0d1bd3ca 100644 --- a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py +++ b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py @@ -1,10 +1,10 @@ -# ------------------------------------------------------------------------------ -# Copyright (c) Microsoft -# Licensed under the MIT License. -# Written by Bin Xiao (Bin.Xiao@microsoft.com) -# Modified by Ke Sun (sunk@mail.ustc.edu.cn) -# https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py -# ------------------------------------------------------------------------------ +""" +Copyright (c) Microsoft +Licensed under the MIT License. +Written by Bin Xiao (Bin.Xiao@microsoft.com) +Modified by Ke Sun (sunk@mail.ustc.edu.cn) +https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py +""" import functools import logging diff --git a/modelscope/models/cv/image_inpainting/__init__.py b/modelscope/models/cv/image_inpainting/__init__.py new file mode 100644 index 00000000..e7c63cd4 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
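With `image_inpainting` added to the `modelscope.models.cv` import tuple above, the subpackage (and the model class exposed through its lazy `__init__`, shown next) is importable in the usual way; a usage note rather than part of the patch:

```python
# The lazy-import __init__ keeps package import cheap; heavy torch/cv
# dependencies are only pulled in when the symbol is actually touched.
from modelscope.models.cv import image_inpainting  # noqa: F401
from modelscope.models.cv.image_inpainting import FFTInpainting
```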
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .model import FFTInpainting + +else: + _import_structure = { + 'model': ['FFTInpainting'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_inpainting/base.py b/modelscope/models/cv/image_inpainting/base.py new file mode 100644 index 00000000..04e73630 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/base.py @@ -0,0 +1,75 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import Dict, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.utils.logger import get_logger +from .modules.adversarial import NonSaturatingWithR1 +from .modules.ffc import FFCResNetGenerator +from .modules.perceptual import ResNetPL +from .modules.pix2pixhd import NLayerDiscriminator + +LOGGER = get_logger() + + +class BaseInpaintingTrainingModule(nn.Module): + + def __init__(self, + model_dir='', + use_ddp=True, + predict_only=False, + visualize_each_iters=100, + average_generator=False, + generator_avg_beta=0.999, + average_generator_start_step=30000, + average_generator_period=10, + store_discr_outputs_for_vis=False, + **kwargs): + super().__init__() + LOGGER.info( + f'BaseInpaintingTrainingModule init called, predict_only is {predict_only}' + ) + + self.generator = FFCResNetGenerator() + self.use_ddp = use_ddp + + if not predict_only: + self.discriminator = NLayerDiscriminator() + self.adversarial_loss = NonSaturatingWithR1( + weight=10, + gp_coef=0.001, + mask_as_fake_target=True, + allow_scale_mask=True) + + self.average_generator = average_generator + self.generator_avg_beta = generator_avg_beta + self.average_generator_start_step = average_generator_start_step + self.average_generator_period = average_generator_period + self.generator_average = None + self.last_generator_averaging_step = -1 + self.store_discr_outputs_for_vis = store_discr_outputs_for_vis + + self.loss_l1 = nn.L1Loss(reduction='none') + + self.loss_resnet_pl = ResNetPL(weight=30, weights_path=model_dir) + + self.visualize_each_iters = visualize_each_iters + LOGGER.info('BaseInpaintingTrainingModule init done') + + def forward(self, batch: Dict[str, + torch.Tensor]) -> Dict[str, torch.Tensor]: + """Pass data through generator and obtain at leas 'predicted_image' and 'inpainted' keys""" + raise NotImplementedError() + + def generator_loss(self, + batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + raise NotImplementedError() + + def discriminator_loss( + self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + raise NotImplementedError() diff --git a/modelscope/models/cv/image_inpainting/default.py b/modelscope/models/cv/image_inpainting/default.py new file mode 100644 index 00000000..5f57d63f --- /dev/null +++ b/modelscope/models/cv/image_inpainting/default.py @@ -0,0 +1,210 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import bisect + +import torch +import torch.nn.functional as F + +from modelscope.utils.logger import get_logger +from .base import BaseInpaintingTrainingModule +from .modules.feature_matching import feature_matching_loss, masked_l1_loss + +LOGGER = get_logger() + + +def set_requires_grad(module, 
value): + for param in module.parameters(): + param.requires_grad = value + + +def add_prefix_to_keys(dct, prefix): + return {prefix + k: v for k, v in dct.items()} + + +class LinearRamp: + + def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0): + self.start_value = start_value + self.end_value = end_value + self.start_iter = start_iter + self.end_iter = end_iter + + def __call__(self, i): + if i < self.start_iter: + return self.start_value + if i >= self.end_iter: + return self.end_value + part = (i - self.start_iter) / (self.end_iter - self.start_iter) + return self.start_value * (1 - part) + self.end_value * part + + +class LadderRamp: + + def __init__(self, start_iters, values): + self.start_iters = start_iters + self.values = values + assert len(values) == len(start_iters) + 1, (len(values), + len(start_iters)) + + def __call__(self, i): + segment_i = bisect.bisect_right(self.start_iters, i) + return self.values[segment_i] + + +def get_ramp(kind='ladder', **kwargs): + if kind == 'linear': + return LinearRamp(**kwargs) + if kind == 'ladder': + return LadderRamp(**kwargs) + raise ValueError(f'Unexpected ramp kind: {kind}') + + +class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule): + + def __init__(self, + model_dir='', + predict_only=False, + concat_mask=True, + rescale_scheduler_kwargs=None, + image_to_discriminator='predicted_image', + add_noise_kwargs=None, + noise_fill_hole=False, + const_area_crop_kwargs=None, + distance_weighter_kwargs=None, + distance_weighted_mask_for_discr=False, + fake_fakes_proba=0, + fake_fakes_generator_kwargs=None, + **kwargs): + super().__init__(model_dir=model_dir, predict_only=predict_only) + self.concat_mask = concat_mask + self.rescale_size_getter = get_ramp( + **rescale_scheduler_kwargs + ) if rescale_scheduler_kwargs is not None else None + self.image_to_discriminator = image_to_discriminator + self.add_noise_kwargs = add_noise_kwargs + self.noise_fill_hole = noise_fill_hole + self.const_area_crop_kwargs = const_area_crop_kwargs + self.refine_mask_for_losses = None + self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr + + self.feature_matching_weight = 100 + self.losses_l1_weight_known = 10 + self.losses_l1_weight_missing = 0 + self.fake_fakes_proba = fake_fakes_proba + + def forward(self, batch): + img = batch['image'] + mask = batch['mask'] + + masked_img = img * (1 - mask) + + if self.concat_mask: + masked_img = torch.cat([masked_img, mask], dim=1) + + batch['predicted_image'] = self.generator(masked_img) + batch['inpainted'] = mask * batch['predicted_image'] + ( + 1 - mask) * batch['image'] + + batch['mask_for_losses'] = mask + + return batch + + def generator_loss(self, batch): + img = batch['image'] + predicted_img = batch[self.image_to_discriminator] + original_mask = batch['mask'] + supervised_mask = batch['mask_for_losses'] + + # L1 + l1_value = masked_l1_loss(predicted_img, img, supervised_mask, + self.losses_l1_weight_known, + self.losses_l1_weight_missing) + + total_loss = l1_value + metrics = dict(gen_l1=l1_value) + + # discriminator + # adversarial_loss calls backward by itself + mask_for_discr = supervised_mask if self.distance_weighted_mask_for_discr else original_mask + self.adversarial_loss.pre_generator_step( + real_batch=img, + fake_batch=predicted_img, + generator=self.generator, + discriminator=self.discriminator) + discr_real_pred, discr_real_features = self.discriminator(img) + discr_fake_pred, discr_fake_features = self.discriminator( + predicted_img) + adv_gen_loss, 
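The `LinearRamp`/`LadderRamp` helpers above are small iteration-indexed schedulers, and `get_ramp` selects one by kind. A quick illustration of the two behaviours; the concrete values are assumptions for the example only:

```python
# LinearRamp interpolates between start_value and end_value over the iteration
# window; LadderRamp is piecewise constant with thresholds in start_iters.
from modelscope.models.cv.image_inpainting.default import LadderRamp, LinearRamp

linear = LinearRamp(start_value=0.0, end_value=1.0, start_iter=0, end_iter=10)
print([linear(i) for i in (0, 5, 10)])        # [0.0, 0.5, 1.0]

ladder = LadderRamp(start_iters=[1000, 5000], values=[0.0, 0.5, 1.0])
print(ladder(0), ladder(2000), ladder(7000))  # 0.0 0.5 1.0
```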
adv_metrics = self.adversarial_loss.generator_loss( + real_batch=img, + fake_batch=predicted_img, + discr_real_pred=discr_real_pred, + discr_fake_pred=discr_fake_pred, + mask=mask_for_discr) + total_loss = total_loss + adv_gen_loss + metrics['gen_adv'] = adv_gen_loss + metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) + + # feature matching + if self.feature_matching_weight > 0: + need_mask_in_fm = False + mask_for_fm = supervised_mask if need_mask_in_fm else None + fm_value = feature_matching_loss( + discr_fake_features, discr_real_features, + mask=mask_for_fm) * self.feature_matching_weight + total_loss = total_loss + fm_value + metrics['gen_fm'] = fm_value + + if self.loss_resnet_pl is not None: + resnet_pl_value = self.loss_resnet_pl(predicted_img, img) + total_loss = total_loss + resnet_pl_value + metrics['gen_resnet_pl'] = resnet_pl_value + + return total_loss, metrics + + def discriminator_loss(self, batch): + total_loss = 0 + metrics = {} + + predicted_img = batch[self.image_to_discriminator].detach() + self.adversarial_loss.pre_discriminator_step( + real_batch=batch['image'], + fake_batch=predicted_img, + generator=self.generator, + discriminator=self.discriminator) + discr_real_pred, discr_real_features = self.discriminator( + batch['image']) + discr_fake_pred, discr_fake_features = self.discriminator( + predicted_img) + adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss( + real_batch=batch['image'], + fake_batch=predicted_img, + discr_real_pred=discr_real_pred, + discr_fake_pred=discr_fake_pred, + mask=batch['mask']) + + total_loss = (total_loss + adv_discr_loss) * 0.1 + metrics['discr_adv'] = adv_discr_loss + metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) + + return total_loss, metrics + + def _do_step(self, batch, optimizer_idx=None): + if optimizer_idx == 0: # step for generator + set_requires_grad(self.generator, True) + set_requires_grad(self.discriminator, False) + elif optimizer_idx == 1: # step for discriminator + set_requires_grad(self.generator, False) + set_requires_grad(self.discriminator, True) + + batch = self(batch) + total_loss = 0 + if optimizer_idx is None or optimizer_idx == 0: # step for generator + total_loss, metrics = self.generator_loss(batch) + + elif optimizer_idx is None or optimizer_idx == 1: # step for discriminator + total_loss, metrics = self.discriminator_loss(batch) + + result = dict(loss=total_loss) + return result diff --git a/modelscope/models/cv/image_inpainting/model.py b/modelscope/models/cv/image_inpainting/model.py new file mode 100644 index 00000000..b12f6edd --- /dev/null +++ b/modelscope/models/cv/image_inpainting/model.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
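The `_do_step` method above alternates between a generator step (`optimizer_idx=0`, discriminator frozen) and a discriminator step (`optimizer_idx=1`, generator frozen). A minimal sketch of how a trainer might drive it with two optimizers; the optimizer choice and learning rates are assumptions, not taken from the patch:

```python
# One alternating GAN iteration over _do_step; hyper-parameters are illustrative.
import torch

def gan_train_step(module, batch, opt_g, opt_d):
    # Generator update: _do_step(optimizer_idx=0) freezes the discriminator.
    opt_g.zero_grad()
    g_loss = module._do_step(batch, optimizer_idx=0)['loss']
    g_loss.backward()
    opt_g.step()

    # Discriminator update: _do_step(optimizer_idx=1) freezes the generator.
    opt_d.zero_grad()
    d_loss = module._do_step(batch, optimizer_idx=1)['loss']
    d_loss.backward()
    opt_d.step()
    return float(g_loss), float(d_loss)

# e.g. opt_g = torch.optim.Adam(module.generator.parameters(), lr=1e-3)
#      opt_d = torch.optim.Adam(module.discriminator.parameters(), lr=1e-4)
```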
+import os +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +LOGGER = get_logger() + + +@MODELS.register_module( + Tasks.image_inpainting, module_name=Models.image_inpainting) +class FFTInpainting(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + super().__init__(model_dir, **kwargs) + + from .default import DefaultInpaintingTrainingModule + pretrained = kwargs.get('pretrained', True) + predict_only = kwargs.get('predict_only', False) + net = DefaultInpaintingTrainingModule( + model_dir=model_dir, predict_only=predict_only) + if pretrained: + path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + LOGGER.info(f'loading pretrained model from {path}') + state = torch.load(path, map_location='cpu') + net.load_state_dict(state, strict=False) + self.model = net + + def forward(self, inputs): + return self.model(inputs) diff --git a/modelscope/models/cv/image_inpainting/modules/__init__.py b/modelscope/models/cv/image_inpainting/modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py b/modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py new file mode 100644 index 00000000..89c3e293 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .base import ModelBuilder diff --git a/modelscope/models/cv/image_inpainting/modules/ade20k/base.py b/modelscope/models/cv/image_inpainting/modules/ade20k/base.py new file mode 100644 index 00000000..02bd3cc4 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ade20k/base.py @@ -0,0 +1,380 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules import BatchNorm2d + +from . import resnet + +NUM_CLASS = 150 + + +# Model Builder +class ModelBuilder: + # custom weights initialization + @staticmethod + def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + nn.init.kaiming_normal_(m.weight.data) + elif classname.find('BatchNorm') != -1: + m.weight.data.fill_(1.) 
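`FFTInpainting` above wraps the training module and loads `ModelFile.TORCH_MODEL_FILE` from `model_dir`. A hedged usage sketch: the local path is a placeholder, the 512×512 shapes are only an example, and the mask uses 1 for the region to fill, matching the `forward()` shown earlier:

```python
# Usage sketch for FFTInpainting; model_dir must contain the released weights.
import torch
from modelscope.models.cv.image_inpainting.model import FFTInpainting

model = FFTInpainting('/path/to/model_dir', predict_only=True).eval()
batch = {
    'image': torch.rand(1, 3, 512, 512),                 # RGB in [0, 1]
    'mask': (torch.rand(1, 1, 512, 512) > 0.9).float(),  # 1 = hole to fill
}
with torch.no_grad():
    out = model(batch)
print(out['inpainted'].shape)  # torch.Size([1, 3, 512, 512])
```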
+ m.bias.data.fill_(1e-4) + + @staticmethod + def build_encoder(arch='resnet50dilated', + fc_dim=512, + weights='', + model_dir=''): + pretrained = True if len(weights) == 0 else False + arch = arch.lower() + if arch == 'resnet50dilated': + orig_resnet = resnet.__dict__['resnet50']( + pretrained=pretrained, model_dir=model_dir) + net_encoder = ResnetDilated(orig_resnet, dilate_scale=8) + elif arch == 'resnet50': + orig_resnet = resnet.__dict__['resnet50']( + pretrained=pretrained, model_dir=model_dir) + net_encoder = Resnet(orig_resnet) + else: + raise Exception('Architecture undefined!') + + # encoders are usually pretrained + # net_encoder.apply(ModelBuilder.weights_init) + if len(weights) > 0: + print('Loading weights for net_encoder') + net_encoder.load_state_dict( + torch.load(weights, map_location=lambda storage, loc: storage), + strict=False) + return net_encoder + + @staticmethod + def build_decoder(arch='ppm_deepsup', + fc_dim=512, + num_class=NUM_CLASS, + weights='', + use_softmax=False, + drop_last_conv=False): + arch = arch.lower() + if arch == 'ppm_deepsup': + net_decoder = PPMDeepsup( + num_class=num_class, + fc_dim=fc_dim, + use_softmax=use_softmax, + drop_last_conv=drop_last_conv) + elif arch == 'c1_deepsup': + net_decoder = C1DeepSup( + num_class=num_class, + fc_dim=fc_dim, + use_softmax=use_softmax, + drop_last_conv=drop_last_conv) + else: + raise Exception('Architecture undefined!') + + net_decoder.apply(ModelBuilder.weights_init) + if len(weights) > 0: + print('Loading weights for net_decoder') + net_decoder.load_state_dict( + torch.load(weights, map_location=lambda storage, loc: storage), + strict=False) + return net_decoder + + @staticmethod + def get_decoder(weights_path, arch_encoder, arch_decoder, fc_dim, + drop_last_conv, *arts, **kwargs): + path = os.path.join( + weights_path, 'ade20k', + f'ade20k-{arch_encoder}-{arch_decoder}/decoder_epoch_20.pth') + return ModelBuilder.build_decoder( + arch=arch_decoder, + fc_dim=fc_dim, + weights=path, + use_softmax=True, + drop_last_conv=drop_last_conv) + + @staticmethod + def get_encoder(weights_path, arch_encoder, arch_decoder, fc_dim, + segmentation, *arts, **kwargs): + if segmentation: + path = os.path.join( + weights_path, 'ade20k', + f'ade20k-{arch_encoder}-{arch_decoder}/encoder_epoch_20.pth') + else: + path = '' + return ModelBuilder.build_encoder( + arch=arch_encoder, + fc_dim=fc_dim, + weights=path, + model_dir=weights_path) + + +def conv3x3_bn_relu(in_planes, out_planes, stride=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False), + BatchNorm2d(out_planes), + nn.ReLU(inplace=True), + ) + + +# pyramid pooling, deep supervision +class PPMDeepsup(nn.Module): + + def __init__(self, + num_class=NUM_CLASS, + fc_dim=4096, + use_softmax=False, + pool_scales=(1, 2, 3, 6), + drop_last_conv=False): + super().__init__() + self.use_softmax = use_softmax + self.drop_last_conv = drop_last_conv + + self.ppm = [] + for scale in pool_scales: + self.ppm.append( + nn.Sequential( + nn.AdaptiveAvgPool2d(scale), + nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False), + BatchNorm2d(512), nn.ReLU(inplace=True))) + self.ppm = nn.ModuleList(self.ppm) + self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) + + self.conv_last = nn.Sequential( + nn.Conv2d( + fc_dim + len(pool_scales) * 512, + 512, + kernel_size=3, + padding=1, + bias=False), BatchNorm2d(512), nn.ReLU(inplace=True), + nn.Dropout2d(0.1), nn.Conv2d(512, num_class, kernel_size=1)) + 
self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + self.dropout_deepsup = nn.Dropout2d(0.1) + + def forward(self, conv_out, segSize=None): + conv5 = conv_out[-1] + + input_size = conv5.size() + ppm_out = [conv5] + for pool_scale in self.ppm: + ppm_out.append( + nn.functional.interpolate( + pool_scale(conv5), (input_size[2], input_size[3]), + mode='bilinear', + align_corners=False)) + ppm_out = torch.cat(ppm_out, 1) + + if self.drop_last_conv: + return ppm_out + else: + x = self.conv_last(ppm_out) + + if self.use_softmax: # is True during inference + x = nn.functional.interpolate( + x, size=segSize, mode='bilinear', align_corners=False) + x = nn.functional.softmax(x, dim=1) + return x + + # deep sup + conv4 = conv_out[-2] + _ = self.cbr_deepsup(conv4) + _ = self.dropout_deepsup(_) + _ = self.conv_last_deepsup(_) + + x = nn.functional.log_softmax(x, dim=1) + _ = nn.functional.log_softmax(_, dim=1) + + return (x, _) + + +class Resnet(nn.Module): + + def __init__(self, orig_resnet): + super(Resnet, self).__init__() + + # take pretrained resnet, except AvgPool and FC + self.conv1 = orig_resnet.conv1 + self.bn1 = orig_resnet.bn1 + self.relu1 = orig_resnet.relu1 + self.conv2 = orig_resnet.conv2 + self.bn2 = orig_resnet.bn2 + self.relu2 = orig_resnet.relu2 + self.conv3 = orig_resnet.conv3 + self.bn3 = orig_resnet.bn3 + self.relu3 = orig_resnet.relu3 + self.maxpool = orig_resnet.maxpool + self.layer1 = orig_resnet.layer1 + self.layer2 = orig_resnet.layer2 + self.layer3 = orig_resnet.layer3 + self.layer4 = orig_resnet.layer4 + + def forward(self, x, return_feature_maps=False): + conv_out = [] + + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.maxpool(x) + + x = self.layer1(x) + conv_out.append(x) + x = self.layer2(x) + conv_out.append(x) + x = self.layer3(x) + conv_out.append(x) + x = self.layer4(x) + conv_out.append(x) + + if return_feature_maps: + return conv_out + return [x] + + +# Resnet Dilated +class ResnetDilated(nn.Module): + + def __init__(self, orig_resnet, dilate_scale=8): + super().__init__() + from functools import partial + + if dilate_scale == 8: + orig_resnet.layer3.apply(partial(self._nostride_dilate, dilate=2)) + orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=4)) + elif dilate_scale == 16: + orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=2)) + + # take pretrained resnet, except AvgPool and FC + self.conv1 = orig_resnet.conv1 + self.bn1 = orig_resnet.bn1 + self.relu1 = orig_resnet.relu1 + self.conv2 = orig_resnet.conv2 + self.bn2 = orig_resnet.bn2 + self.relu2 = orig_resnet.relu2 + self.conv3 = orig_resnet.conv3 + self.bn3 = orig_resnet.bn3 + self.relu3 = orig_resnet.relu3 + self.maxpool = orig_resnet.maxpool + self.layer1 = orig_resnet.layer1 + self.layer2 = orig_resnet.layer2 + self.layer3 = orig_resnet.layer3 + self.layer4 = orig_resnet.layer4 + + def _nostride_dilate(self, m, dilate): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + # the convolution with stride + if m.stride == (2, 2): + m.stride = (1, 1) + if m.kernel_size == (3, 3): + m.dilation = (dilate // 2, dilate // 2) + m.padding = (dilate // 2, dilate // 2) + # other convoluions + else: + if m.kernel_size == (3, 3): + m.dilation = (dilate, dilate) + m.padding = (dilate, dilate) + + def forward(self, x, return_feature_maps=False): + conv_out = [] + + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = 
self.relu3(self.bn3(self.conv3(x))) + x = self.maxpool(x) + + x = self.layer1(x) + conv_out.append(x) + x = self.layer2(x) + conv_out.append(x) + x = self.layer3(x) + conv_out.append(x) + x = self.layer4(x) + conv_out.append(x) + + if return_feature_maps: + return conv_out + return [x] + + +# last conv, deep supervision +class C1DeepSup(nn.Module): + + def __init__(self, + num_class=150, + fc_dim=2048, + use_softmax=False, + drop_last_conv=False): + super(C1DeepSup, self).__init__() + self.use_softmax = use_softmax + self.drop_last_conv = drop_last_conv + + self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) + self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) + + # last conv + self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + + def forward(self, conv_out, segSize=None): + conv5 = conv_out[-1] + + x = self.cbr(conv5) + + if self.drop_last_conv: + return x + else: + x = self.conv_last(x) + + if self.use_softmax: # is True during inference + x = nn.functional.interpolate( + x, size=segSize, mode='bilinear', align_corners=False) + x = nn.functional.softmax(x, dim=1) + return x + + # deep sup + conv4 = conv_out[-2] + _ = self.cbr_deepsup(conv4) + _ = self.conv_last_deepsup(_) + + x = nn.functional.log_softmax(x, dim=1) + _ = nn.functional.log_softmax(_, dim=1) + + return (x, _) + + +# last conv +class C1(nn.Module): + + def __init__(self, num_class=150, fc_dim=2048, use_softmax=False): + super(C1, self).__init__() + self.use_softmax = use_softmax + + self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) + + # last conv + self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + + def forward(self, conv_out, segSize=None): + conv5 = conv_out[-1] + x = self.cbr(conv5) + x = self.conv_last(x) + + if self.use_softmax: # is True during inference + x = nn.functional.interpolate( + x, size=segSize, mode='bilinear', align_corners=False) + x = nn.functional.softmax(x, dim=1) + else: + x = nn.functional.log_softmax(x, dim=1) + + return x diff --git a/modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py b/modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py new file mode 100644 index 00000000..7da9ff07 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py @@ -0,0 +1,183 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import math +import os + +import torch +import torch.nn as nn +from torch.nn import BatchNorm2d + +__all__ = ['ResNet', 'resnet50'] + + +def conv3x3(in_planes, out_planes, stride=1): + '3x3 convolution with padding' + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, 
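The `_nostride_dilate` hook above converts the stride-2 convolutions of layer3/layer4 into dilated ones, so `ResnetDilated` keeps deep features at 1/8 of the input resolution instead of 1/32. A small check of the resulting feature-map sizes, assuming a 256×256 input and randomly initialised weights (`pretrained=False`):

```python
# Feature widths from ResnetDilated(dilate_scale=8): layer1 stays at 1/4,
# and layer2-layer4 are all held at 1/8 of the 256-pixel input.
import torch
from modelscope.models.cv.image_inpainting.modules.ade20k import resnet
from modelscope.models.cv.image_inpainting.modules.ade20k.base import ResnetDilated

backbone = ResnetDilated(resnet.resnet50(pretrained=False), dilate_scale=8)
feats = backbone(torch.rand(1, 3, 256, 256), return_feature_maps=True)
print([f.shape[-1] for f in feats])  # [64, 32, 32, 32]
```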
inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm2d(planes) + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + self.bn2 = BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__(self, block, layers, num_classes=1000): + self.inplanes = 128 + super(ResNet, self).__init__() + self.conv1 = conv3x3(3, 64, stride=2) + self.bn1 = BatchNorm2d(64) + self.relu1 = nn.ReLU(inplace=True) + self.conv2 = conv3x3(64, 64) + self.bn2 = BatchNorm2d(64) + self.relu2 = nn.ReLU(inplace=True) + self.conv3 = conv3x3(64, 128) + self.bn3 = BatchNorm2d(128) + self.relu3 = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def resnet50(pretrained=False, model_dir='', **kwargs): + """Constructs a ResNet-50 model. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + cached_file = os.path.join(model_dir, 'resnet50-imagenet.pth') + model.load_state_dict( + torch.load(cached_file, map_location='cpu'), strict=False) + return model diff --git a/modelscope/models/cv/image_inpainting/modules/adversarial.py b/modelscope/models/cv/image_inpainting/modules/adversarial.py new file mode 100644 index 00000000..b183876b --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/adversarial.py @@ -0,0 +1,167 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BaseAdversarialLoss: + + def pre_generator_step(self, real_batch: torch.Tensor, + fake_batch: torch.Tensor, generator: nn.Module, + discriminator: nn.Module): + """ + Prepare for generator step + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param generator: + :param discriminator: + :return: None + """ + + def pre_discriminator_step(self, real_batch: torch.Tensor, + fake_batch: torch.Tensor, generator: nn.Module, + discriminator: nn.Module): + """ + Prepare for discriminator step + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param generator: + :param discriminator: + :return: None + """ + + def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask: Optional[torch.Tensor] = None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """ + Calculate generator loss + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param discr_real_pred: Tensor, discriminator output for real_batch + :param discr_fake_pred: Tensor, discriminator output for fake_batch + :param mask: Tensor, actual mask, which was at input of generator when making fake_batch + :return: total generator loss along with some values that might be interesting to log + """ + raise NotImplementedError + + def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask: Optional[torch.Tensor] = None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """ + Calculate discriminator loss and call .backward() on it + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param discr_real_pred: Tensor, discriminator output for real_batch + :param discr_fake_pred: Tensor, discriminator output for fake_batch + :param mask: Tensor, actual mask, which was at input of generator when making fake_batch + :return: total discriminator loss along with some values that might be interesting to log + """ + raise NotImplementedError + + def interpolate_mask(self, mask, shape): + assert mask is not None + assert self.allow_scale_mask or shape == mask.shape[-2:] + if shape != mask.shape[-2:] and self.allow_scale_mask: + if self.mask_scale_mode == 'maxpool': + mask = F.adaptive_max_pool2d(mask, shape) + else: + mask = F.interpolate( + mask, size=shape, mode=self.mask_scale_mode) + return mask + + +def make_r1_gp(discr_real_pred, 
real_batch): + if torch.is_grad_enabled(): + grad_real = torch.autograd.grad( + outputs=discr_real_pred.sum(), + inputs=real_batch, + create_graph=True)[0] + grad_penalty = (grad_real.view(grad_real.shape[0], + -1).norm(2, dim=1)**2).mean() + else: + grad_penalty = 0 + real_batch.requires_grad = False + + return grad_penalty + + +class NonSaturatingWithR1(BaseAdversarialLoss): + + def __init__(self, + gp_coef=5, + weight=1, + mask_as_fake_target=False, + allow_scale_mask=False, + mask_scale_mode='nearest', + extra_mask_weight_for_gen=0, + use_unmasked_for_gen=True, + use_unmasked_for_discr=True): + self.gp_coef = gp_coef + self.weight = weight + # use for discr => use for gen; + # otherwise we teach only the discr to pay attention to very small difference + assert use_unmasked_for_gen or (not use_unmasked_for_discr) + # mask as target => use unmasked for discr: + # if we don't care about unmasked regions at all + # then it doesn't matter if the value of mask_as_fake_target is true or false + assert use_unmasked_for_discr or (not mask_as_fake_target) + self.use_unmasked_for_gen = use_unmasked_for_gen + self.use_unmasked_for_discr = use_unmasked_for_discr + self.mask_as_fake_target = mask_as_fake_target + self.allow_scale_mask = allow_scale_mask + self.mask_scale_mode = mask_scale_mode + self.extra_mask_weight_for_gen = extra_mask_weight_for_gen + + def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask=None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + fake_loss = F.softplus(-discr_fake_pred) + if (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0) or \ + not self.use_unmasked_for_gen: # == if masked region should be treated differently + mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) + if not self.use_unmasked_for_gen: + fake_loss = fake_loss * mask + else: + pixel_weights = 1 + mask * self.extra_mask_weight_for_gen + fake_loss = fake_loss * pixel_weights + + return fake_loss.mean() * self.weight, dict() + + def pre_discriminator_step(self, real_batch: torch.Tensor, + fake_batch: torch.Tensor, generator: nn.Module, + discriminator: nn.Module): + real_batch.requires_grad = True + + def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask=None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + + real_loss = F.softplus(-discr_real_pred) + grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef + fake_loss = F.softplus(discr_fake_pred) + + if not self.use_unmasked_for_discr or self.mask_as_fake_target: + # == if masked region should be treated differently + mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) + # use_unmasked_for_discr=False only makes sense for fakes; + # for reals there is no difference beetween two regions + fake_loss = fake_loss * mask + if self.mask_as_fake_target: + fake_loss = fake_loss + (1 + - mask) * F.softplus(-discr_fake_pred) + + sum_discr_loss = real_loss + grad_penalty + fake_loss + metrics = dict( + discr_real_out=discr_real_pred.mean(), + discr_fake_out=discr_fake_pred.mean(), + discr_real_gp=grad_penalty) + return sum_discr_loss.mean(), metrics diff --git a/modelscope/models/cv/image_inpainting/modules/feature_matching.py b/modelscope/models/cv/image_inpainting/modules/feature_matching.py new file mode 100644 index 00000000..c2effb20 --- /dev/null +++ 
b/modelscope/models/cv/image_inpainting/modules/feature_matching.py @@ -0,0 +1,45 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import List + +import torch +import torch.nn.functional as F + + +def masked_l2_loss(pred, target, mask, weight_known, weight_missing): + per_pixel_l2 = F.mse_loss(pred, target, reduction='none') + pixel_weights = mask * weight_missing + (1 - mask) * weight_known + return (pixel_weights * per_pixel_l2).mean() + + +def masked_l1_loss(pred, target, mask, weight_known, weight_missing): + per_pixel_l1 = F.l1_loss(pred, target, reduction='none') + pixel_weights = mask * weight_missing + (1 - mask) * weight_known + return (pixel_weights * per_pixel_l1).mean() + + +def feature_matching_loss(fake_features: List[torch.Tensor], + target_features: List[torch.Tensor], + mask=None): + if mask is None: + res = torch.stack([ + F.mse_loss(fake_feat, target_feat) + for fake_feat, target_feat in zip(fake_features, target_features) + ]).mean() + else: + res = 0 + norm = 0 + for fake_feat, target_feat in zip(fake_features, target_features): + cur_mask = F.interpolate( + mask, + size=fake_feat.shape[-2:], + mode='bilinear', + align_corners=False) + error_weights = 1 - cur_mask + cur_val = ((fake_feat - target_feat).pow(2) * error_weights).mean() + res = res + cur_val + norm += 1 + res = res / norm + return res diff --git a/modelscope/models/cv/image_inpainting/modules/ffc.py b/modelscope/models/cv/image_inpainting/modules/ffc.py new file mode 100644 index 00000000..c74425e3 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ffc.py @@ -0,0 +1,588 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from kornia.geometry.transform import rotate + + +def get_activation(kind='tanh'): + if kind == 'tanh': + return nn.Tanh() + if kind == 'sigmoid': + return nn.Sigmoid() + if kind is False: + return nn.Identity() + raise ValueError(f'Unknown activation kind {kind}') + + +class SELayer(nn.Module): + + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid()) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + res = x * y.expand_as(x) + return res + + +class FourierUnit(nn.Module): + + def __init__(self, + in_channels, + out_channels, + groups=1, + spatial_scale_factor=None, + spatial_scale_mode='bilinear', + spectral_pos_encoding=False, + use_se=False, + se_kwargs=None, + ffc3d=False, + fft_norm='ortho'): + # bn_layer not used + super(FourierUnit, self).__init__() + self.groups = groups + + self.conv_layer = torch.nn.Conv2d( + in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0), + out_channels=out_channels * 2, + kernel_size=1, + stride=1, + padding=0, + groups=self.groups, + bias=False) + self.bn = torch.nn.BatchNorm2d(out_channels * 2) + self.relu = torch.nn.ReLU(inplace=True) + + # squeeze and excitation block + self.use_se = use_se + if use_se: + if se_kwargs is None: + se_kwargs = {} + self.se = SELayer(self.conv_layer.in_channels, **se_kwargs) + + self.spatial_scale_factor = 
spatial_scale_factor + self.spatial_scale_mode = spatial_scale_mode + self.spectral_pos_encoding = spectral_pos_encoding + self.ffc3d = ffc3d + self.fft_norm = fft_norm + + def forward(self, x): + batch = x.shape[0] + + if self.spatial_scale_factor is not None: + orig_size = x.shape[-2:] + x = F.interpolate( + x, + scale_factor=self.spatial_scale_factor, + mode=self.spatial_scale_mode, + align_corners=False) + + # (batch, c, h, w/2+1, 2) + fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1) + ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm) + ffted = torch.stack((ffted.real, ffted.imag), dim=-1) + ffted = ffted.permute(0, 1, 4, 2, + 3).contiguous() # (batch, c, 2, h, w/2+1) + ffted = ffted.view(( + batch, + -1, + ) + ffted.size()[3:]) + + if self.spectral_pos_encoding: + height, width = ffted.shape[-2:] + coords_vert = torch.linspace(0, 1, + height)[None, None, :, None].expand( + batch, 1, height, width).to(ffted) + coords_hor = torch.linspace(0, 1, + width)[None, None, None, :].expand( + batch, 1, height, width).to(ffted) + ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1) + + if self.use_se: + ffted = self.se(ffted) + + ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1) + ffted = self.relu(self.bn(ffted)) + + ffted = ffted.view(( + batch, + -1, + 2, + ) + ffted.size()[2:]).permute( + 0, 1, 3, 4, 2).contiguous() # (batch,c, t, h, w/2+1, 2) + ffted = torch.complex(ffted[..., 0], ffted[..., 1]) + + ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:] + output = torch.fft.irfftn( + ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm) + + if self.spatial_scale_factor is not None: + output = F.interpolate( + output, + size=orig_size, + mode=self.spatial_scale_mode, + align_corners=False) + + return output + + +class SpectralTransform(nn.Module): + + def __init__(self, + in_channels, + out_channels, + stride=1, + groups=1, + enable_lfu=True, + **fu_kwargs): + # bn_layer not used + super(SpectralTransform, self).__init__() + self.enable_lfu = enable_lfu + if stride == 2: + self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2) + else: + self.downsample = nn.Identity() + + self.stride = stride + self.conv1 = nn.Sequential( + nn.Conv2d( + in_channels, + out_channels // 2, + kernel_size=1, + groups=groups, + bias=False), nn.BatchNorm2d(out_channels // 2), + nn.ReLU(inplace=True)) + self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups, + **fu_kwargs) + if self.enable_lfu: + self.lfu = FourierUnit(out_channels // 2, out_channels // 2, + groups) + self.conv2 = torch.nn.Conv2d( + out_channels // 2, + out_channels, + kernel_size=1, + groups=groups, + bias=False) + + def forward(self, x): + + x = self.downsample(x) + x = self.conv1(x) + output = self.fu(x) + + if self.enable_lfu: + n, c, h, w = x.shape + split_no = 2 + split_s = h // split_no + xs = torch.cat( + torch.split(x[:, :c // 4], split_s, dim=-2), + dim=1).contiguous() + xs = torch.cat( + torch.split(xs, split_s, dim=-1), dim=1).contiguous() + xs = self.lfu(xs) + xs = xs.repeat(1, 1, split_no, split_no).contiguous() + else: + xs = 0 + + output = self.conv2(x + output + xs) + + return output + + +class LearnableSpatialTransformWrapper(nn.Module): + + def __init__(self, + impl, + pad_coef=0.5, + angle_init_range=80, + train_angle=True): + super().__init__() + self.impl = impl + self.angle = torch.rand(1) * angle_init_range + if train_angle: + self.angle = nn.Parameter(self.angle, requires_grad=True) + self.pad_coef = pad_coef + + def forward(self, x): + if torch.is_tensor(x): + return 
self.inverse_transform(self.impl(self.transform(x)), x) + elif isinstance(x, tuple): + x_trans = tuple(self.transform(elem) for elem in x) + y_trans = self.impl(x_trans) + return tuple( + self.inverse_transform(elem, orig_x) + for elem, orig_x in zip(y_trans, x)) + else: + raise ValueError(f'Unexpected input type {type(x)}') + + def transform(self, x): + height, width = x.shape[2:] + pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) + x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect') + x_padded_rotated = rotate(x_padded, angle=self.angle.to(x_padded)) + return x_padded_rotated + + def inverse_transform(self, y_padded_rotated, orig_x): + height, width = orig_x.shape[2:] + pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) + + y_padded = rotate( + y_padded_rotated, angle=-self.angle.to(y_padded_rotated)) + y_height, y_width = y_padded.shape[2:] + y = y_padded[:, :, pad_h:y_height - pad_h, pad_w:y_width - pad_w] + return y + + +class FFC(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + enable_lfu=True, + padding_type='reflect', + gated=False, + **spectral_kwargs): + super(FFC, self).__init__() + + assert stride == 1 or stride == 2, 'Stride should be 1 or 2.' + self.stride = stride + + in_cg = int(in_channels * ratio_gin) + in_cl = in_channels - in_cg + out_cg = int(out_channels * ratio_gout) + out_cl = out_channels - out_cg + + self.ratio_gin = ratio_gin + self.ratio_gout = ratio_gout + self.global_in_num = in_cg + + module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d + self.convl2l = module( + in_cl, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type) + module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d + self.convl2g = module( + in_cl, + out_cg, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type) + module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d + self.convg2l = module( + in_cg, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type) + module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform + self.convg2g = module(in_cg, out_cg, stride, + 1 if groups == 1 else groups // 2, enable_lfu, + **spectral_kwargs) + + self.gated = gated + module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d + self.gate = module(in_channels, 2, 1) + + def forward(self, x): + x_l, x_g = x if type(x) is tuple else (x, 0) + out_xl, out_xg = 0, 0 + + if self.gated: + total_input_parts = [x_l] + if torch.is_tensor(x_g): + total_input_parts.append(x_g) + total_input = torch.cat(total_input_parts, dim=1) + + gates = torch.sigmoid(self.gate(total_input)) + g2l_gate, l2g_gate = gates.chunk(2, dim=1) + else: + g2l_gate, l2g_gate = 1, 1 + + if self.ratio_gout != 1: + out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate + if self.ratio_gout != 0: + out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g) + + return out_xl, out_xg + + +class FFC_BN_ACT(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + norm_layer=nn.BatchNorm2d, + activation_layer=nn.Identity, + padding_type='reflect', + enable_lfu=True, + **kwargs): + super(FFC_BN_ACT, self).__init__() + self.ffc = FFC( + in_channels, + 
out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride, + padding, + dilation, + groups, + bias, + enable_lfu, + padding_type=padding_type, + **kwargs) + lnorm = nn.Identity if ratio_gout == 1 else norm_layer + gnorm = nn.Identity if ratio_gout == 0 else norm_layer + global_channels = int(out_channels * ratio_gout) + self.bn_l = lnorm(out_channels - global_channels) + self.bn_g = gnorm(global_channels) + + lact = nn.Identity if ratio_gout == 1 else activation_layer + gact = nn.Identity if ratio_gout == 0 else activation_layer + self.act_l = lact(inplace=True) + self.act_g = gact(inplace=True) + + def forward(self, x): + x_l, x_g = self.ffc(x) + x_l = self.act_l(self.bn_l(x_l)) + x_g = self.act_g(self.bn_g(x_g)) + return x_l, x_g + + +class FFCResnetBlock(nn.Module): + + def __init__(self, + dim, + padding_type, + norm_layer, + activation_layer=nn.ReLU, + dilation=1, + spatial_transform_kwargs=None, + inline=False, + **conv_kwargs): + super().__init__() + self.conv1 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + activation_layer=activation_layer, + padding_type=padding_type, + **conv_kwargs) + self.conv2 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + activation_layer=activation_layer, + padding_type=padding_type, + **conv_kwargs) + if spatial_transform_kwargs is not None: + self.conv1 = LearnableSpatialTransformWrapper( + self.conv1, **spatial_transform_kwargs) + self.conv2 = LearnableSpatialTransformWrapper( + self.conv2, **spatial_transform_kwargs) + self.inline = inline + + def forward(self, x): + if self.inline: + x_l, x_g = x[:, :-self.conv1.ffc. + global_in_num], x[:, -self.conv1.ffc.global_in_num:] + else: + x_l, x_g = x if type(x) is tuple else (x, 0) + + id_l, id_g = x_l, x_g + + x_l, x_g = self.conv1((x_l, x_g)) + x_l, x_g = self.conv2((x_l, x_g)) + + x_l, x_g = id_l + x_l, id_g + x_g + out = x_l, x_g + if self.inline: + out = torch.cat(out, dim=1) + return out + + +class ConcatTupleLayer(nn.Module): + + def forward(self, x): + assert isinstance(x, tuple) + x_l, x_g = x + assert torch.is_tensor(x_l) or torch.is_tensor(x_g) + if not torch.is_tensor(x_g): + return x_l + return torch.cat(x, dim=1) + + +class FFCResNetGenerator(nn.Module): + + def __init__(self, + input_nc=4, + output_nc=3, + ngf=64, + n_downsampling=3, + n_blocks=18, + norm_layer=nn.BatchNorm2d, + padding_type='reflect', + activation_layer=nn.ReLU, + up_norm_layer=nn.BatchNorm2d, + up_activation=nn.ReLU(True), + init_conv_kwargs={ + 'ratio_gin': 0, + 'ratio_gout': 0, + 'enable_lfu': False + }, + downsample_conv_kwargs={ + 'ratio_gin': 0, + 'ratio_gout': 0, + 'enable_lfu': False + }, + resnet_conv_kwargs={ + 'ratio_gin': 0.75, + 'ratio_gout': 0.75, + 'enable_lfu': False + }, + spatial_transform_layers=None, + spatial_transform_kwargs={}, + add_out_act='sigmoid', + max_features=1024, + out_ffc=False, + out_ffc_kwargs={}): + assert (n_blocks >= 0) + super().__init__() + + model = [ + nn.ReflectionPad2d(3), + FFC_BN_ACT( + input_nc, + ngf, + kernel_size=7, + padding=0, + norm_layer=norm_layer, + activation_layer=activation_layer, + **init_conv_kwargs) + ] + + # downsample + for i in range(n_downsampling): + mult = 2**i + if i == n_downsampling - 1: + cur_conv_kwargs = dict(downsample_conv_kwargs) + cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get( + 'ratio_gin', 0) + else: + cur_conv_kwargs = downsample_conv_kwargs + model += [ + FFC_BN_ACT( + min(max_features, ngf * mult), + 
min(max_features, ngf * mult * 2), + kernel_size=3, + stride=2, + padding=1, + norm_layer=norm_layer, + activation_layer=activation_layer, + **cur_conv_kwargs) + ] + + mult = 2**n_downsampling + feats_num_bottleneck = min(max_features, ngf * mult) + + # resnet blocks + for i in range(n_blocks): + cur_resblock = FFCResnetBlock( + feats_num_bottleneck, + padding_type=padding_type, + activation_layer=activation_layer, + norm_layer=norm_layer, + **resnet_conv_kwargs) + if spatial_transform_layers is not None and i in spatial_transform_layers: + cur_resblock = LearnableSpatialTransformWrapper( + cur_resblock, **spatial_transform_kwargs) + model += [cur_resblock] + + model += [ConcatTupleLayer()] + + # upsample + for i in range(n_downsampling): + mult = 2**(n_downsampling - i) + model += [ + nn.ConvTranspose2d( + min(max_features, ngf * mult), + min(max_features, int(ngf * mult / 2)), + kernel_size=3, + stride=2, + padding=1, + output_padding=1), + up_norm_layer(min(max_features, int(ngf * mult / 2))), + up_activation + ] + + if out_ffc: + model += [ + FFCResnetBlock( + ngf, + padding_type=padding_type, + activation_layer=activation_layer, + norm_layer=norm_layer, + inline=True, + **out_ffc_kwargs) + ] + + model += [ + nn.ReflectionPad2d(3), + nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0) + ] + if add_out_act: + model.append( + get_activation('tanh' if add_out_act is True else add_out_act)) + self.model = nn.Sequential(*model) + + def forward(self, input): + return self.model(input) diff --git a/modelscope/models/cv/image_inpainting/modules/inception.py b/modelscope/models/cv/image_inpainting/modules/inception.py new file mode 100644 index 00000000..5070533d --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/inception.py @@ -0,0 +1,324 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision import models + +from modelscope.utils.logger import get_logger + +try: + from torchvision.models.utils import load_state_dict_from_url +except ImportError: + from torch.utils.model_zoo import load_url as load_state_dict_from_url + +# Inception weights ported to Pytorch from +# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz +FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/' \ + 'fid_weights/pt_inception-2015-12-05-6726825d.pth' + +LOGGER = get_logger() + + +class InceptionV3(nn.Module): + """Pretrained InceptionV3 network returning feature maps""" + + # Index of default block of inception to return, + # corresponds to output of final average pooling + DEFAULT_BLOCK_INDEX = 3 + + # Maps feature dimensionality to their output blocks indices + BLOCK_INDEX_BY_DIM = { + 64: 0, # First max pooling features + 192: 1, # Second max pooling featurs + 768: 2, # Pre-aux classifier features + 2048: 3 # Final average pooling features + } + + def __init__(self, + output_blocks=[DEFAULT_BLOCK_INDEX], + resize_input=True, + normalize_input=True, + requires_grad=False, + use_fid_inception=True): + """Build pretrained InceptionV3 + + Parameters + ---------- + output_blocks : list of int + Indices of blocks to return features of. 
Possible values are: + - 0: corresponds to output of first max pooling + - 1: corresponds to output of second max pooling + - 2: corresponds to output which is fed to aux classifier + - 3: corresponds to output of final average pooling + resize_input : bool + If true, bilinearly resizes input to width and height 299 before + feeding input to model. As the network without fully connected + layers is fully convolutional, it should be able to handle inputs + of arbitrary size, so resizing might not be strictly needed + normalize_input : bool + If true, scales the input from range (0, 1) to the range the + pretrained Inception network expects, namely (-1, 1) + requires_grad : bool + If true, parameters of the model require gradients. Possibly useful + for finetuning the network + use_fid_inception : bool + If true, uses the pretrained Inception model used in Tensorflow's + FID implementation. If false, uses the pretrained Inception model + available in torchvision. The FID Inception model has different + weights and a slightly different structure from torchvision's + Inception model. If you want to compute FID scores, you are + strongly advised to set this parameter to true to get comparable + results. + """ + super(InceptionV3, self).__init__() + + self.resize_input = resize_input + self.normalize_input = normalize_input + self.output_blocks = sorted(output_blocks) + self.last_needed_block = max(output_blocks) + + assert self.last_needed_block <= 3, \ + 'Last possible output block index is 3' + + self.blocks = nn.ModuleList() + + if use_fid_inception: + inception = fid_inception_v3() + else: + inception = models.inception_v3(pretrained=True) + + # Block 0: input to maxpool1 + block0 = [ + inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3, + inception.Conv2d_2b_3x3, + nn.MaxPool2d(kernel_size=3, stride=2) + ] + self.blocks.append(nn.Sequential(*block0)) + + # Block 1: maxpool1 to maxpool2 + if self.last_needed_block >= 1: + block1 = [ + inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3, + nn.MaxPool2d(kernel_size=3, stride=2) + ] + self.blocks.append(nn.Sequential(*block1)) + + # Block 2: maxpool2 to aux classifier + if self.last_needed_block >= 2: + block2 = [ + inception.Mixed_5b, + inception.Mixed_5c, + inception.Mixed_5d, + inception.Mixed_6a, + inception.Mixed_6b, + inception.Mixed_6c, + inception.Mixed_6d, + inception.Mixed_6e, + ] + self.blocks.append(nn.Sequential(*block2)) + + # Block 3: aux classifier to final avgpool + if self.last_needed_block >= 3: + block3 = [ + inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c, + nn.AdaptiveAvgPool2d(output_size=(1, 1)) + ] + self.blocks.append(nn.Sequential(*block3)) + + for param in self.parameters(): + param.requires_grad = requires_grad + + def forward(self, inp): + """Get Inception feature maps + + Parameters + ---------- + inp : torch.autograd.Variable + Input tensor of shape Bx3xHxW. 
Values are expected to be in + range (0, 1) + + Returns + ------- + List of torch.autograd.Variable, corresponding to the selected output + block, sorted ascending by index + """ + outp = [] + x = inp + + if self.resize_input: + x = F.interpolate( + x, size=(299, 299), mode='bilinear', align_corners=False) + + if self.normalize_input: + x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) + + for idx, block in enumerate(self.blocks): + x = block(x) + if idx in self.output_blocks: + outp.append(x) + + if idx == self.last_needed_block: + break + + return outp + + +def fid_inception_v3(): + """Build pretrained Inception model for FID computation + + The Inception model for FID computation uses a different set of weights + and has a slightly different structure than torchvision's Inception. + + This method first constructs torchvision's Inception and then patches the + necessary parts that are different in the FID Inception model. + """ + LOGGER.info('fid_inception_v3 called') + inception = models.inception_v3( + num_classes=1008, aux_logits=False, pretrained=False) + LOGGER.info('models.inception_v3 done') + inception.Mixed_5b = FIDInceptionA(192, pool_features=32) + inception.Mixed_5c = FIDInceptionA(256, pool_features=64) + inception.Mixed_5d = FIDInceptionA(288, pool_features=64) + inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128) + inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160) + inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160) + inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192) + inception.Mixed_7b = FIDInceptionE_1(1280) + inception.Mixed_7c = FIDInceptionE_2(2048) + + LOGGER.info('fid_inception_v3 patching done') + + state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True) + LOGGER.info('fid_inception_v3 weights downloaded') + + inception.load_state_dict(state_dict) + LOGGER.info('fid_inception_v3 weights loaded into model') + + return inception + + +class FIDInceptionA(models.inception.InceptionA): + """InceptionA block patched for FID computation""" + + def __init__(self, in_channels, pool_features): + super(FIDInceptionA, self).__init__(in_channels, pool_features) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + branch_pool = F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] + return torch.cat(outputs, 1) + + +class FIDInceptionC(models.inception.InceptionC): + """InceptionC block patched for FID computation""" + + def __init__(self, in_channels, channels_7x7): + super(FIDInceptionC, self).__init__(in_channels, channels_7x7) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + branch_pool = 
F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] + return torch.cat(outputs, 1) + + +class FIDInceptionE_1(models.inception.InceptionE): + """First InceptionE block patched for FID computation""" + + def __init__(self, in_channels): + super(FIDInceptionE_1, self).__init__(in_channels) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = torch.cat(branch3x3, 1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = torch.cat(branch3x3dbl, 1) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + branch_pool = F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] + return torch.cat(outputs, 1) + + +class FIDInceptionE_2(models.inception.InceptionE): + """Second InceptionE block patched for FID computation""" + + def __init__(self, in_channels): + super(FIDInceptionE_2, self).__init__(in_channels) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = torch.cat(branch3x3, 1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = torch.cat(branch3x3dbl, 1) + + # Patch: The FID Inception model uses max pooling instead of average + # pooling. This is likely an error in this specific Inception + # implementation, as other Inception models use average pooling here + # (which matches the description in the paper). 
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] + return torch.cat(outputs, 1) diff --git a/modelscope/models/cv/image_inpainting/modules/perceptual.py b/modelscope/models/cv/image_inpainting/modules/perceptual.py new file mode 100644 index 00000000..80fe2b96 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/perceptual.py @@ -0,0 +1,47 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision + +from .ade20k import ModelBuilder + +IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None] +IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None] + + +class ResNetPL(nn.Module): + + def __init__(self, + weight=1, + weights_path=None, + arch_encoder='resnet50dilated', + segmentation=True): + super().__init__() + self.impl = ModelBuilder.get_encoder( + weights_path=weights_path, + arch_encoder=arch_encoder, + arch_decoder='ppm_deepsup', + fc_dim=2048, + segmentation=segmentation) + self.impl.eval() + for w in self.impl.parameters(): + w.requires_grad_(False) + + self.weight = weight + + def forward(self, pred, target): + pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred) + target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target) + + pred_feats = self.impl(pred, return_feature_maps=True) + target_feats = self.impl(target, return_feature_maps=True) + + result = torch.stack([ + F.mse_loss(cur_pred, cur_target) + for cur_pred, cur_target in zip(pred_feats, target_feats) + ]).sum() * self.weight + return result diff --git a/modelscope/models/cv/image_inpainting/modules/pix2pixhd.py b/modelscope/models/cv/image_inpainting/modules/pix2pixhd.py new file mode 100644 index 00000000..32e18f3e --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/pix2pixhd.py @@ -0,0 +1,75 @@ +""" +The implementation is adopted from +https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py +""" +import collections +import functools +import logging +from collections import defaultdict +from functools import partial + +import numpy as np +import torch.nn as nn + + +# Defines the PatchGAN discriminator with the specified arguments. 
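+# Note: forward() returns a tuple (patch_logits, intermediate_activations); the
+# intermediate activations are typically consumed by the feature-matching loss
+# (see feature_matching.py in this package). Rough usage sketch, shapes illustrative:
+#
+#   disc = NLayerDiscriminator(input_nc=3)
+#   logits, feats = disc(torch.randn(1, 3, 256, 256))
+#   # logits: (1, 1, h', w') patch map; feats: 5 intermediate feature tensors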
+class NLayerDiscriminator(nn.Module): + + def __init__( + self, + input_nc=3, + ndf=64, + n_layers=4, + norm_layer=nn.BatchNorm2d, + ): + super().__init__() + self.n_layers = n_layers + + kw = 4 + padw = int(np.ceil((kw - 1.0) / 2)) + sequence = [[ + nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), + nn.LeakyReLU(0.2, True) + ]] + + nf = ndf + for n in range(1, n_layers): + nf_prev = nf + nf = min(nf * 2, 512) + + cur_model = [] + cur_model += [ + nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2, padding=padw), + norm_layer(nf), + nn.LeakyReLU(0.2, True) + ] + sequence.append(cur_model) + + nf_prev = nf + nf = min(nf * 2, 512) + + cur_model = [] + cur_model += [ + nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw), + norm_layer(nf), + nn.LeakyReLU(0.2, True) + ] + sequence.append(cur_model) + + sequence += [[ + nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw) + ]] + + for n in range(len(sequence)): + setattr(self, 'model' + str(n), nn.Sequential(*sequence[n])) + + def get_all_activations(self, x): + res = [x] + for n in range(self.n_layers + 2): + model = getattr(self, 'model' + str(n)) + res.append(model(res[-1])) + return res[1:] + + def forward(self, x): + act = self.get_all_activations(x) + return act[-1], act[:-1] diff --git a/modelscope/models/cv/image_inpainting/refinement.py b/modelscope/models/cv/image_inpainting/refinement.py new file mode 100644 index 00000000..662d8a05 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/refinement.py @@ -0,0 +1,393 @@ +''' +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +''' +import cv2 +import numpy as np +import torch +import torch.nn as nn +from kornia.filters import gaussian_blur2d +from kornia.geometry.transform import resize +from kornia.morphology import erosion +from torch.nn import functional as F +from torch.optim import SGD, Adam +from tqdm import tqdm + +from .modules.ffc import FFCResnetBlock + + +def move_to_device(obj, device): + if isinstance(obj, nn.Module): + return obj.to(device) + if torch.is_tensor(obj): + return obj.to(device) + if isinstance(obj, (tuple, list)): + return [move_to_device(el, device) for el in obj] + if isinstance(obj, dict): + return {name: move_to_device(val, device) for name, val in obj.items()} + raise ValueError(f'Unexpected type {type(obj)}') + + +def ceil_modulo(x, mod): + if x % mod == 0: + return x + return (x // mod + 1) * mod + + +def pad_tensor_to_modulo(img, mod): + batch_size, channels, height, width = img.shape + out_height = ceil_modulo(height, mod) + out_width = ceil_modulo(width, mod) + return F.pad( + img, + pad=(0, out_width - width, 0, out_height - height), + mode='reflect') + + +def _pyrdown(im: torch.Tensor, downsize: tuple = None): + """downscale the image""" + if downsize is None: + downsize = (im.shape[2] // 2, im.shape[3] // 2) + assert im.shape[ + 1] == 3, 'Expected shape for the input to be (n,3,height,width)' + im = gaussian_blur2d(im, kernel_size=(5, 5), sigma=(1.0, 1.0)) + im = F.interpolate(im, size=downsize, mode='bilinear', align_corners=False) + return im + + +def _pyrdown_mask(mask: torch.Tensor, + downsize: tuple = None, + eps: float = 1e-8, + blur_mask: bool = True, + round_up: bool = True): + """downscale the mask tensor + + Parameters + ---------- + mask : torch.Tensor + mask of size (B, 1, H, W) + downsize : tuple, optional + size to downscale to. 
If None, image is downscaled to half, by default None + eps : float, optional + threshold value for binarizing the mask, by default 1e-8 + blur_mask : bool, optional + if True, apply gaussian filter before downscaling, by default True + round_up : bool, optional + if True, values above eps are marked 1, else, values below 1-eps are marked 0, by default True + + Returns + ------- + torch.Tensor + downscaled mask + """ + + if downsize is None: + downsize = (mask.shape[2] // 2, mask.shape[3] // 2) + assert mask.shape[ + 1] == 1, 'Expected shape for the input to be (n,1,height,width)' + if blur_mask is True: + mask = gaussian_blur2d(mask, kernel_size=(5, 5), sigma=(1.0, 1.0)) + mask = F.interpolate( + mask, size=downsize, mode='bilinear', align_corners=False) + else: + mask = F.interpolate( + mask, size=downsize, mode='bilinear', align_corners=False) + if round_up: + mask[mask >= eps] = 1 + mask[mask < eps] = 0 + else: + mask[mask >= 1.0 - eps] = 1 + mask[mask < 1.0 - eps] = 0 + return mask + + +def _erode_mask(mask: torch.Tensor, + ekernel: torch.Tensor = None, + eps: float = 1e-8): + """erode the mask, and set gray pixels to 0""" + if ekernel is not None: + mask = erosion(mask, ekernel) + mask[mask >= 1.0 - eps] = 1 + mask[mask < 1.0 - eps] = 0 + return mask + + +def _l1_loss(pred: torch.Tensor, + pred_downscaled: torch.Tensor, + ref: torch.Tensor, + mask: torch.Tensor, + mask_downscaled: torch.Tensor, + image: torch.Tensor, + on_pred: bool = True): + """l1 loss on src pixels, and downscaled predictions if on_pred=True""" + loss = torch.mean(torch.abs(pred[mask < 1e-8] - image[mask < 1e-8])) + if on_pred: + loss += torch.mean( + torch.abs(pred_downscaled[mask_downscaled >= 1e-8] + - ref[mask_downscaled >= 1e-8])) + return loss + + +def _infer(image: torch.Tensor, + mask: torch.Tensor, + forward_front: nn.Module, + forward_rears: nn.Module, + ref_lower_res: torch.Tensor, + orig_shape: tuple, + devices: list, + scale_ind: int, + n_iters: int = 15, + lr: float = 0.002): + """Performs inference with refinement at a given scale. 
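+    When a coarser-scale reference is available, the front features (z1, z2) are
+    treated as free variables and optimized with Adam so that the downscaled
+    prediction stays consistent with the inpainting from the previous scale.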
+ + Parameters + ---------- + image : torch.Tensor + input image to be inpainted, of size (1,3,H,W) + mask : torch.Tensor + input inpainting mask, of size (1,1,H,W) + forward_front : nn.Module + the front part of the inpainting network + forward_rears : nn.Module + the rear part of the inpainting network + ref_lower_res : torch.Tensor + the inpainting at previous scale, used as reference image + orig_shape : tuple + shape of the original input image before padding + devices : list + list of available devices + scale_ind : int + the scale index + n_iters : int, optional + number of iterations of refinement, by default 15 + lr : float, optional + learning rate, by default 0.002 + + Returns + ------- + torch.Tensor + inpainted image + """ + masked_image = image * (1 - mask) + masked_image = torch.cat([masked_image, mask], dim=1) + + mask = mask.repeat(1, 3, 1, 1) + if ref_lower_res is not None: + ref_lower_res = ref_lower_res.detach() + with torch.no_grad(): + z1, z2 = forward_front(masked_image) + # Inference + mask = mask.to(devices[-1]) + ekernel = torch.from_numpy( + cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (15, 15)).astype(bool)).float() + ekernel = ekernel.to(devices[-1]) + image = image.to(devices[-1]) + z1, z2 = z1.detach().to(devices[0]), z2.detach().to(devices[0]) + z1.requires_grad, z2.requires_grad = True, True + + optimizer = Adam([z1, z2], lr=lr) + + pbar = tqdm(range(n_iters), leave=False) + for idi in pbar: + optimizer.zero_grad() + input_feat = (z1, z2) + for idd, forward_rear in enumerate(forward_rears): + output_feat = forward_rear(input_feat) + if idd < len(devices) - 1: + midz1, midz2 = output_feat + midz1, midz2 = midz1.to(devices[idd + 1]), midz2.to( + devices[idd + 1]) + input_feat = (midz1, midz2) + else: + pred = output_feat + + if ref_lower_res is None: + break + losses = {} + # scaled loss with downsampler + pred_downscaled = _pyrdown(pred[:, :, :orig_shape[0], :orig_shape[1]]) + mask_downscaled = _pyrdown_mask( + mask[:, :1, :orig_shape[0], :orig_shape[1]], + blur_mask=False, + round_up=False) + mask_downscaled = _erode_mask(mask_downscaled, ekernel=ekernel) + mask_downscaled = mask_downscaled.repeat(1, 3, 1, 1) + losses['ms_l1'] = _l1_loss( + pred, + pred_downscaled, + ref_lower_res, + mask, + mask_downscaled, + image, + on_pred=True) + + loss = sum(losses.values()) + pbar.set_description( + 'Refining scale {} using scale {} ...current loss: {:.4f}'.format( + scale_ind + 1, scale_ind, loss.item())) + if idi < n_iters - 1: + loss.backward() + optimizer.step() + del pred_downscaled + del loss + del pred + # "pred" is the prediction after Plug-n-Play module + inpainted = mask * pred + (1 - mask) * image + inpainted = inpainted.detach().cpu() + return inpainted + + +def _get_image_mask_pyramid(batch: dict, min_side: int, max_scales: int, + px_budget: int): + """Build the image mask pyramid + + Parameters + ---------- + batch : dict + batch containing image, mask, etc + min_side : int + minimum side length to limit the number of scales of the pyramid + max_scales : int + maximum number of scales allowed + px_budget : int + the product H*W cannot exceed this budget, because of resource constraints + + Returns + ------- + tuple + image-mask pyramid in the form of list of images and list of masks + """ + + assert batch['image'].shape[ + 0] == 1, 'refiner works on only batches of size 1!' 
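+    # Steps below: crop away the padding recorded in 'unpad_to_size', downscale
+    # if H * W exceeds px_budget, then build roughly log2(min(H, W) / min_side)
+    # pyramid levels (capped at max_scales) via repeated _pyrdown / _pyrdown_mask.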
+ + h, w = batch['unpad_to_size'] + h, w = h[0].item(), w[0].item() + + image = batch['image'][..., :h, :w] + mask = batch['mask'][..., :h, :w] + if h * w > px_budget: + # resize + ratio = np.sqrt(px_budget / float(h * w)) + h_orig, w_orig = h, w + h, w = int(h * ratio), int(w * ratio) + print( + f'Original image too large for refinement! Resizing {(h_orig,w_orig)} to {(h,w)}...' + ) + image = resize( + image, (h, w), interpolation='bilinear', align_corners=False) + mask = resize( + mask, (h, w), interpolation='bilinear', align_corners=False) + mask[mask > 1e-8] = 1 + breadth = min(h, w) + n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))), + max_scales) + ls_images = [] + ls_masks = [] + + ls_images.append(image) + ls_masks.append(mask) + + for _ in range(n_scales - 1): + image_p = _pyrdown(ls_images[-1]) + mask_p = _pyrdown_mask(ls_masks[-1]) + ls_images.append(image_p) + ls_masks.append(mask_p) + # reverse the lists because we want the lowest resolution image as index 0 + return ls_images[::-1], ls_masks[::-1] + + +def refine_predict(batch: dict, inpainter: nn.Module, gpu_ids: str, + modulo: int, n_iters: int, lr: float, min_side: int, + max_scales: int, px_budget: int): + """Refines the inpainting of the network + + Parameters + ---------- + batch : dict + image-mask batch, currently we assume the batchsize to be 1 + inpainter : nn.Module + the inpainting neural network + gpu_ids : str + the GPU ids of the machine to use. If only single GPU, use: "0," + modulo : int + pad the image to ensure dimension % modulo == 0 + n_iters : int + number of iterations of refinement for each scale + lr : float + learning rate + min_side : int + all sides of image on all scales should be >= min_side / sqrt(2) + max_scales : int + max number of downscaling scales for the image-mask pyramid + px_budget : int + pixels budget. 
Any image will be resized to satisfy height*width <= px_budget + + Returns + ------- + torch.Tensor + inpainted image of size (1,3,H,W) + """ + inpainter = inpainter.model + assert not inpainter.training + assert not inpainter.add_noise_kwargs + assert inpainter.concat_mask + + gpu_ids = [ + f'cuda:{gpuid}' for gpuid in gpu_ids.replace(' ', '').split(',') + if gpuid.isdigit() + ] + n_resnet_blocks = 0 + first_resblock_ind = 0 + found_first_resblock = False + for idl in range(len(inpainter.generator.model)): + if isinstance(inpainter.generator.model[idl], FFCResnetBlock): + n_resnet_blocks += 1 + found_first_resblock = True + elif not found_first_resblock: + first_resblock_ind += 1 + resblocks_per_gpu = n_resnet_blocks // len(gpu_ids) + + devices = [torch.device(gpu_id) for gpu_id in gpu_ids] + + # split the model into front, and rear parts + forward_front = inpainter.generator.model[0:first_resblock_ind] + forward_front.to(devices[0]) + forward_rears = [] + for idd in range(len(gpu_ids)): + if idd < len(gpu_ids) - 1: + forward_rears.append( + inpainter.generator.model[first_resblock_ind + + resblocks_per_gpu + * (idd):first_resblock_ind + + resblocks_per_gpu * (idd + 1)]) + else: + forward_rears.append( + inpainter.generator.model[first_resblock_ind + + resblocks_per_gpu * (idd):]) + forward_rears[idd].to(devices[idd]) + + ls_images, ls_masks = _get_image_mask_pyramid(batch, min_side, max_scales, + px_budget) + image_inpainted = None + + for ids, (image, mask) in enumerate(zip(ls_images, ls_masks)): + orig_shape = image.shape[2:] + image = pad_tensor_to_modulo(image, modulo) + mask = pad_tensor_to_modulo(mask, modulo) + mask[mask >= 1e-8] = 1.0 + mask[mask < 1e-8] = 0.0 + image, mask = move_to_device(image, devices[0]), move_to_device( + mask, devices[0]) + if image_inpainted is not None: + image_inpainted = move_to_device(image_inpainted, devices[-1]) + image_inpainted = _infer(image, mask, forward_front, forward_rears, + image_inpainted, orig_shape, devices, ids, + n_iters, lr) + image_inpainted = image_inpainted[:, :, :orig_shape[0], :orig_shape[1]] + # detach everything to save resources + image = image.detach().cpu() + mask = mask.detach().cpu() + + return image_inpainted diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index e2bf5bc1..35c060f0 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -11,6 +11,7 @@ if TYPE_CHECKING: from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset from .movie_scene_segmentation import MovieSceneSegmentationDataset from .video_summarization_dataset import VideoSummarizationDataset + from .image_inpainting import ImageInpaintingDataset from .passage_ranking_dataset import PassageRankingDataset else: @@ -24,6 +25,7 @@ else: ['ImageInstanceSegmentationCocoDataset'], 'video_summarization_dataset': ['VideoSummarizationDataset'], 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], + 'image_inpainting': ['ImageInpaintingDataset'], } import sys diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py b/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py new file mode 100644 index 00000000..732a1bd7 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from .image_inpainting_dataset import ImageInpaintingDataset diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/aug.py b/modelscope/msdatasets/task_datasets/image_inpainting/aug.py new file mode 100644 index 00000000..445bb9b4 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_inpainting/aug.py @@ -0,0 +1,100 @@ +""" +The implementation is borrowed from LaMa, +publicly available at https://github.com/saic-mdal/lama +""" +import imgaug.augmenters as iaa +from albumentations import DualIAATransform, to_tuple + + +class IAAAffine2(DualIAATransform): + """Place a regular grid of points on the input and randomly move the neighbourhood of these point around + via affine transformations. + + Note: This class introduce interpolation artifacts to mask if it has values other than {0;1} + + Args: + p (float): probability of applying the transform. Default: 0.5. + + Targets: + image, mask + """ + + def __init__( + self, + scale=(0.7, 1.3), + translate_percent=None, + translate_px=None, + rotate=0.0, + shear=(-0.1, 0.1), + order=1, + cval=0, + mode='reflect', + always_apply=False, + p=0.5, + ): + super(IAAAffine2, self).__init__(always_apply, p) + self.scale = dict(x=scale, y=scale) + self.translate_percent = to_tuple(translate_percent, 0) + self.translate_px = to_tuple(translate_px, 0) + self.rotate = to_tuple(rotate) + self.shear = dict(x=shear, y=shear) + self.order = order + self.cval = cval + self.mode = mode + + @property + def processor(self): + return iaa.Affine( + self.scale, + self.translate_percent, + self.translate_px, + self.rotate, + self.shear, + self.order, + self.cval, + self.mode, + ) + + def get_transform_init_args_names(self): + return ('scale', 'translate_percent', 'translate_px', 'rotate', + 'shear', 'order', 'cval', 'mode') + + +class IAAPerspective2(DualIAATransform): + """Perform a random four point perspective transform of the input. + + Note: This class introduce interpolation artifacts to mask if it has values other than {0;1} + + Args: + scale ((float, float): standard deviation of the normal distributions. These are used to sample + the random distances of the subimage's corners from the full image's corners. Default: (0.05, 0.1). + p (float): probability of applying the transform. Default: 0.5. 
+ + Targets: + image, mask + """ + + def __init__(self, + scale=(0.05, 0.1), + keep_size=True, + always_apply=False, + p=0.5, + order=1, + cval=0, + mode='replicate'): + super(IAAPerspective2, self).__init__(always_apply, p) + self.scale = to_tuple(scale, 1.0) + self.keep_size = keep_size + self.cval = cval + self.mode = mode + + @property + def processor(self): + return iaa.PerspectiveTransform( + self.scale, + keep_size=self.keep_size, + mode=self.mode, + cval=self.cval) + + def get_transform_init_args_names(self): + return ('scale', 'keep_size') diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py b/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py new file mode 100644 index 00000000..057b8f88 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py @@ -0,0 +1,337 @@ +""" +Part of the implementation is borrowed and modified from LaMa, +publicly available at https://github.com/saic-mdal/lama +""" +import glob +import os +import os.path as osp +from enum import Enum + +import albumentations as A +import cv2 +import json +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .aug import IAAAffine2, IAAPerspective2 + +LOGGER = get_logger() + + +class LinearRamp: + + def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0): + self.start_value = start_value + self.end_value = end_value + self.start_iter = start_iter + self.end_iter = end_iter + + def __call__(self, i): + if i < self.start_iter: + return self.start_value + if i >= self.end_iter: + return self.end_value + part = (i - self.start_iter) / (self.end_iter - self.start_iter) + return self.start_value * (1 - part) + self.end_value * part + + +class DrawMethod(Enum): + LINE = 'line' + CIRCLE = 'circle' + SQUARE = 'square' + + +def make_random_superres_mask(shape, + min_step=2, + max_step=4, + min_width=1, + max_width=3): + height, width = shape + mask = np.zeros((height, width), np.float32) + step_x = np.random.randint(min_step, max_step + 1) + width_x = np.random.randint(min_width, min(step_x, max_width + 1)) + offset_x = np.random.randint(0, step_x) + + step_y = np.random.randint(min_step, max_step + 1) + width_y = np.random.randint(min_width, min(step_y, max_width + 1)) + offset_y = np.random.randint(0, step_y) + + for dy in range(width_y): + mask[offset_y + dy::step_y] = 1 + for dx in range(width_x): + mask[:, offset_x + dx::step_x] = 1 + return mask[None, ...] 
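+# Illustrative note: each mask helper in this file returns a float32 array of
+# shape (1, H, W) with ones marking the pixels to be inpainted, e.g.
+#
+#   mask = make_random_superres_mask((256, 256))
+#   assert mask.shape == (1, 256, 256) and mask.dtype == np.float32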
+ + +class RandomSuperresMaskGenerator: + + def __init__(self, **kwargs): + self.kwargs = kwargs + + def __call__(self, img, iter_i=None): + return make_random_superres_mask(img.shape[1:], **self.kwargs) + + +def make_random_rectangle_mask(shape, + margin=10, + bbox_min_size=30, + bbox_max_size=100, + min_times=0, + max_times=3): + height, width = shape + mask = np.zeros((height, width), np.float32) + bbox_max_size = min(bbox_max_size, height - margin * 2, width - margin * 2) + times = np.random.randint(min_times, max_times + 1) + for i in range(times): + box_width = np.random.randint(bbox_min_size, bbox_max_size) + box_height = np.random.randint(bbox_min_size, bbox_max_size) + start_x = np.random.randint(margin, width - margin - box_width + 1) + start_y = np.random.randint(margin, height - margin - box_height + 1) + mask[start_y:start_y + box_height, start_x:start_x + box_width] = 1 + return mask[None, ...] + + +class RandomRectangleMaskGenerator: + + def __init__(self, + margin=10, + bbox_min_size=30, + bbox_max_size=100, + min_times=0, + max_times=3, + ramp_kwargs=None): + self.margin = margin + self.bbox_min_size = bbox_min_size + self.bbox_max_size = bbox_max_size + self.min_times = min_times + self.max_times = max_times + self.ramp = LinearRamp( + **ramp_kwargs) if ramp_kwargs is not None else None + + def __call__(self, img, iter_i=None, raw_image=None): + coef = self.ramp(iter_i) if (self.ramp is not None) and ( + iter_i is not None) else 1 + cur_bbox_max_size = int(self.bbox_min_size + 1 + + (self.bbox_max_size - self.bbox_min_size) + * coef) + cur_max_times = int(self.min_times + + (self.max_times - self.min_times) * coef) + return make_random_rectangle_mask( + img.shape[1:], + margin=self.margin, + bbox_min_size=self.bbox_min_size, + bbox_max_size=cur_bbox_max_size, + min_times=self.min_times, + max_times=cur_max_times) + + +def make_random_irregular_mask(shape, + max_angle=4, + max_len=60, + max_width=20, + min_times=0, + max_times=10, + draw_method=DrawMethod.LINE): + draw_method = DrawMethod(draw_method) + + height, width = shape + mask = np.zeros((height, width), np.float32) + times = np.random.randint(min_times, max_times + 1) + for i in range(times): + start_x = np.random.randint(width) + start_y = np.random.randint(height) + for j in range(1 + np.random.randint(5)): + angle = 0.01 + np.random.randint(max_angle) + if i % 2 == 0: + angle = 2 * 3.1415926 - angle + length = 10 + np.random.randint(max_len) + brush_w = 5 + np.random.randint(max_width) + end_x = np.clip( + (start_x + length * np.sin(angle)).astype(np.int32), 0, width) + end_y = np.clip( + (start_y + length * np.cos(angle)).astype(np.int32), 0, height) + if draw_method == DrawMethod.LINE: + cv2.line(mask, (start_x, start_y), (end_x, end_y), 1.0, + brush_w) + elif draw_method == DrawMethod.CIRCLE: + cv2.circle( + mask, (start_x, start_y), + radius=brush_w, + color=1., + thickness=-1) + elif draw_method == DrawMethod.SQUARE: + radius = brush_w // 2 + mask[start_y - radius:start_y + radius, + start_x - radius:start_x + radius] = 1 + start_x, start_y = end_x, end_y + return mask[None, ...] 
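+# Illustrative example (parameter values are arbitrary, not the training defaults):
+# thick random brush strokes drawn as connected line segments.
+#
+#   mask = make_random_irregular_mask((512, 512), max_len=120, max_width=40,
+#                                     max_times=5, draw_method=DrawMethod.LINE)
+#   # -> float32 array of shape (1, 512, 512), with 1s along the drawn strokes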
+ + +class RandomIrregularMaskGenerator: + + def __init__(self, + max_angle=4, + max_len=60, + max_width=20, + min_times=0, + max_times=10, + ramp_kwargs=None, + draw_method=DrawMethod.LINE): + self.max_angle = max_angle + self.max_len = max_len + self.max_width = max_width + self.min_times = min_times + self.max_times = max_times + self.draw_method = draw_method + self.ramp = LinearRamp( + **ramp_kwargs) if ramp_kwargs is not None else None + + def __call__(self, img, iter_i=None, raw_image=None): + coef = self.ramp(iter_i) if (self.ramp is not None) and ( + iter_i is not None) else 1 + cur_max_len = int(max(1, self.max_len * coef)) + cur_max_width = int(max(1, self.max_width * coef)) + cur_max_times = int(self.min_times + 1 + + (self.max_times - self.min_times) * coef) + return make_random_irregular_mask( + img.shape[1:], + max_angle=self.max_angle, + max_len=cur_max_len, + max_width=cur_max_width, + min_times=self.min_times, + max_times=cur_max_times, + draw_method=self.draw_method) + + +class MixedMaskGenerator: + + def __init__(self, + irregular_proba=1 / 3, + irregular_kwargs=None, + box_proba=1 / 3, + box_kwargs=None, + segm_proba=1 / 3, + segm_kwargs=None, + squares_proba=0, + squares_kwargs=None, + superres_proba=0, + superres_kwargs=None, + outpainting_proba=0, + outpainting_kwargs=None, + invert_proba=0): + self.probas = [] + self.gens = [] + + if irregular_proba > 0: + self.probas.append(irregular_proba) + if irregular_kwargs is None: + irregular_kwargs = {} + else: + irregular_kwargs = dict(irregular_kwargs) + irregular_kwargs['draw_method'] = DrawMethod.LINE + self.gens.append(RandomIrregularMaskGenerator(**irregular_kwargs)) + + if box_proba > 0: + self.probas.append(box_proba) + if box_kwargs is None: + box_kwargs = {} + self.gens.append(RandomRectangleMaskGenerator(**box_kwargs)) + + if squares_proba > 0: + self.probas.append(squares_proba) + if squares_kwargs is None: + squares_kwargs = {} + else: + squares_kwargs = dict(squares_kwargs) + squares_kwargs['draw_method'] = DrawMethod.SQUARE + self.gens.append(RandomIrregularMaskGenerator(**squares_kwargs)) + + if superres_proba > 0: + self.probas.append(superres_proba) + if superres_kwargs is None: + superres_kwargs = {} + self.gens.append(RandomSuperresMaskGenerator(**superres_kwargs)) + + self.probas = np.array(self.probas, dtype='float32') + self.probas /= self.probas.sum() + self.invert_proba = invert_proba + + def __call__(self, img, iter_i=None, raw_image=None): + kind = np.random.choice(len(self.probas), p=self.probas) + gen = self.gens[kind] + result = gen(img, iter_i=iter_i, raw_image=raw_image) + if self.invert_proba > 0 and random.random() < self.invert_proba: + result = 1 - result + return result + + +def get_transforms(test_mode, out_size): + if not test_mode: + transform = A.Compose([ + IAAPerspective2(scale=(0.0, 0.06)), + IAAAffine2(scale=(0.7, 1.3), rotate=(-40, 40), shear=(-0.1, 0.1)), + A.PadIfNeeded(min_height=out_size, min_width=out_size), + A.OpticalDistortion(), + A.RandomCrop(height=out_size, width=out_size), + A.HorizontalFlip(), + A.CLAHE(), + A.RandomBrightnessContrast( + brightness_limit=0.2, contrast_limit=0.2), + A.HueSaturationValue( + hue_shift_limit=5, sat_shift_limit=30, val_shift_limit=5), + A.ToFloat() + ]) + else: + transform = A.Compose([ + A.PadIfNeeded(min_height=out_size, min_width=out_size), + A.CenterCrop(height=out_size, width=out_size), + A.ToFloat() + ]) + return transform + + +@TASK_DATASETS.register_module( + Tasks.image_inpainting, module_name=Models.image_inpainting) +class 
ImageInpaintingDataset(TorchTaskDataset): + + def __init__(self, **kwargs): + split_config = kwargs['split_config'] + LOGGER.info(kwargs) + mode = kwargs.get('test_mode', False) + + self.data_root = next(iter(split_config.values())) + if not osp.exists(self.data_root): + self.data_root = osp.dirname(self.data_root) + assert osp.exists(self.data_root) + mask_gen_kwargs = kwargs.get('mask_gen_kwargs', {}) + out_size = kwargs.get('out_size', 256) + self.mask_generator = MixedMaskGenerator(**mask_gen_kwargs) + self.transform = get_transforms(mode, out_size) + self.in_files = sorted( + list( + glob.glob( + osp.join(self.data_root, '**', '*.jpg'), recursive=True)) + + list( + glob.glob( + osp.join(self.data_root, '**', '*.png'), recursive=True))) + self.iter_i = 0 + + def __len__(self): + return len(self.in_files) + + def __getitem__(self, index): + path = self.in_files[index] + img = cv2.imread(path) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = self.transform(image=img)['image'] + img = np.transpose(img, (2, 0, 1)) + # TODO: maybe generate mask before augmentations? slower, but better for segmentation-based masks + mask = self.mask_generator(img, iter_i=self.iter_i) + self.iter_i += 1 + return dict(image=img, mask=mask) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 07a14191..dd59d6fb 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -177,6 +177,7 @@ TASK_OUTPUTS = { Tasks.image_denoising: [OutputKeys.OUTPUT_IMG], Tasks.image_portrait_enhancement: [OutputKeys.OUTPUT_IMG], Tasks.crowd_counting: [OutputKeys.SCORES, OutputKeys.OUTPUT_IMG], + Tasks.image_inpainting: [OutputKeys.OUTPUT_IMG], # image generation task result for a single image # {"output_img": np.array with shape (h, w, 3)} diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index c9a70d14..b18d4465 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -181,6 +181,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'), Tasks.shop_segmentation: (Pipelines.shop_segmentation, 'damo/cv_vitb16_segmentation_shop-seg'), + Tasks.image_inpainting: (Pipelines.image_inpainting, + 'damo/cv_fft_inpainting_lama'), Tasks.video_inpainting: (Pipelines.video_inpainting, 'damo/cv_video-inpainting'), Tasks.hand_static: (Pipelines.hand_static, diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 55bad09a..118eaf17 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: from .image_super_resolution_pipeline import ImageSuperResolutionPipeline from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline + from .image_inpainting_pipeline import ImageInpaintingPipeline from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline from .realtime_object_detection_pipeline import RealtimeObjectDetectionPipeline from .live_category_pipeline import LiveCategoryPipeline @@ -99,6 +100,7 @@ else: 'live_category_pipeline': ['LiveCategoryPipeline'], 'image_to_image_generation_pipeline': ['Image2ImageGenerationPipeline'], + 'image_inpainting_pipeline': ['ImageInpaintingPipeline'], 'ocr_detection_pipeline': ['OCRDetectionPipeline'], 'ocr_recognition_pipeline': ['OCRRecognitionPipeline'], 'skin_retouching_pipeline': ['SkinRetouchingPipeline'], diff --git a/modelscope/pipelines/cv/image_inpainting_pipeline.py 
b/modelscope/pipelines/cv/image_inpainting_pipeline.py new file mode 100644 index 00000000..6ae0d63e --- /dev/null +++ b/modelscope/pipelines/cv/image_inpainting_pipeline.py @@ -0,0 +1,146 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import torch +import torch.nn as nn +from torch.utils.data._utils.collate import default_collate + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.image_inpainting import FFTInpainting +from modelscope.models.cv.image_inpainting.refinement import refine_predict +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.image import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_inpainting, module_name=Pipelines.image_inpainting) +class ImageInpaintingPipeline(Pipeline): + + def __init__(self, + model: str, + pad_out_to_modulo=8, + refine=False, + **kwargs): + """ + model: model id on modelscope hub. + """ + assert isinstance(model, str), 'model must be a single str' + super().__init__(model=model, auto_collate=False, **kwargs) + self.refine = refine + logger.info(f'loading model from dir {model}') + self.infer_model = FFTInpainting(model, predict_only=True) + if not self.refine: + self.infer_model.to(self.device) + self.infer_model.eval() + logger.info(f'loading model done, refinement is set to {self.refine}') + self.pad_out_to_modulo = pad_out_to_modulo + + def move_to_device(self, obj, device): + if isinstance(obj, nn.Module): + return obj.to(device) + if torch.is_tensor(obj): + return obj.to(device) + if isinstance(obj, (tuple, list)): + return [self.move_to_device(el, device) for el in obj] + if isinstance(obj, dict): + return { + name: self.move_to_device(val, device) + for name, val in obj.items() + } + raise ValueError(f'Unexpected type {type(obj)}') + + def transforms(self, img): + if img.ndim == 3: + img = np.transpose(img, (2, 0, 1)) + out_img = img.astype('float32') / 255 + return out_img + + def ceil_modulo(self, x, mod): + if x % mod == 0: + return x + return (x // mod + 1) * mod + + def pad_img_to_modulo(self, img, mod): + channels, height, width = img.shape + out_height = self.ceil_modulo(height, mod) + out_width = self.ceil_modulo(width, mod) + return np.pad( + img, ((0, 0), (0, out_height - height), (0, out_width - width)), + mode='symmetric') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + image_name, mask_name = input.split('+') + img = LoadImage.convert_to_ndarray(image_name) + img = self.transforms(img) + mask = np.array(LoadImage(mode='L')(mask_name)['img']) + mask = self.transforms(mask) + elif isinstance(input, PIL.Image.Image): + img = input.crop((0, 0, int(input.width / 2), input.height)) + img = self.transforms(np.array(img)) + mask = input.crop((int(input.width / 2), 0, input.width, + input.height)).convert('L') + mask = self.transforms(np.array(mask)) + else: + raise TypeError('input should be either str or PIL.Image') + result = dict(image=img, mask=mask[None, ...]) + + if self.pad_out_to_modulo is not None and self.pad_out_to_modulo > 1: + result['unpad_to_size'] = result['image'].shape[1:] + result['image'] = self.pad_img_to_modulo(result['image'], + self.pad_out_to_modulo) + result['mask'] = self.pad_img_to_modulo(result['mask'], + 
self.pad_out_to_modulo) + + # Since Pipeline use default torch.no_grad() for performing forward func. + # We conduct inference here in case of doing training for refinement. + result = self.perform_inference(result) + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + return {OutputKeys.OUTPUT_IMG: input} + + def perform_inference(self, data): + batch = default_collate([data]) + if self.refine: + assert 'unpad_to_size' in batch, 'Unpadded size is required for the refinement' + assert 'cuda' in str(self.device), 'GPU is required for refinement' + gpu_ids = str(self.device).split(':')[-1] + cur_res = refine_predict( + batch, + self.infer_model, + gpu_ids=gpu_ids, + modulo=self.pad_out_to_modulo, + n_iters=15, + lr=0.002, + min_side=512, + max_scales=3, + px_budget=900000) + cur_res = cur_res[0].permute(1, 2, 0).detach().cpu().numpy() + else: + with torch.no_grad(): + batch = self.move_to_device(batch, self.device) + batch['mask'] = (batch['mask'] > 0) * 1 + batch = self.infer_model(batch) + cur_res = batch['inpainted'][0].permute( + 1, 2, 0).detach().cpu().numpy() + unpad_to_size = batch.get('unpad_to_size', None) + if unpad_to_size is not None: + orig_height, orig_width = unpad_to_size + cur_res = cur_res[:orig_height, :orig_width] + + cur_res = np.clip(cur_res * 255, 0, 255).astype('uint8') + cur_res = cv2.cvtColor(cur_res, cv2.COLOR_RGB2BGR) + return cur_res + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index a632642a..86917261 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from .builder import build_trainer from .cv import (ImageInstanceSegmentationTrainer, ImagePortraitEnhancementTrainer, - MovieSceneSegmentationTrainer) + MovieSceneSegmentationTrainer, ImageInpaintingTrainer) from .multi_modal import CLIPTrainer from .nlp import SequenceClassificationTrainer, PassageRankingTrainer from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer @@ -22,7 +22,8 @@ else: 'builder': ['build_trainer'], 'cv': [ 'ImageInstanceSegmentationTrainer', - 'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer' + 'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer', + 'ImageInpaintingTrainer' ], 'multi_modal': ['CLIPTrainer'], 'nlp': ['SequenceClassificationTrainer', 'PassageRankingTrainer'], diff --git a/modelscope/trainers/cv/__init__.py b/modelscope/trainers/cv/__init__.py index 4c65870e..d09fd75c 100644 --- a/modelscope/trainers/cv/__init__.py +++ b/modelscope/trainers/cv/__init__.py @@ -8,6 +8,7 @@ if TYPE_CHECKING: ImageInstanceSegmentationTrainer from .image_portrait_enhancement_trainer import ImagePortraitEnhancementTrainer from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer + from .image_inpainting_trainer import ImageInpaintingTrainer else: _import_structure = { @@ -15,7 +16,8 @@ else: ['ImageInstanceSegmentationTrainer'], 'image_portrait_enhancement_trainer': ['ImagePortraitEnhancementTrainer'], - 'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'] + 'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'], + 'image_inpainting_trainer': ['ImageInpaintingTrainer'] } import sys diff --git a/modelscope/trainers/cv/image_inpainting_trainer.py b/modelscope/trainers/cv/image_inpainting_trainer.py new file mode 100644 index 00000000..74d1ed9f --- /dev/null +++ 
b/modelscope/trainers/cv/image_inpainting_trainer.py @@ -0,0 +1,111 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import time +from collections.abc import Mapping + +from torch import distributed as dist + +from modelscope.metainfo import Trainers +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.trainer import EpochBasedTrainer +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, + ConfigKeys, Hubs, ModeKeys, ModelFile, + Tasks, TrainerStages) +from modelscope.utils.data_utils import to_device +from modelscope.utils.file_utils import func_receive_dict_inputs + + +@TRAINERS.register_module(module_name=Trainers.image_inpainting) +class ImageInpaintingTrainer(EpochBasedTrainer): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def train(self, *args, **kwargs): + super().train(*args, **kwargs) + + def evaluate(self, *args, **kwargs): + metric_values = super().evaluate(*args, **kwargs) + return metric_values + + def prediction_step(self, model, inputs): + pass + + def train_loop(self, data_loader): + """ Training loop used by `EpochBasedTrainer.train()` + """ + self.invoke_hook(TrainerStages.before_run) + self._epoch = 0 + self.model.train() + for _ in range(self._epoch, self._max_epochs): + self.invoke_hook(TrainerStages.before_train_epoch) + for i, data_batch in enumerate(data_loader): + data_batch = to_device(data_batch, self.device) + self.data_batch = data_batch + self._inner_iter = i + for idx in range(2): + self.invoke_hook(TrainerStages.before_train_iter) + self.train_step(self.model, data_batch, idx) + self.invoke_hook(TrainerStages.after_train_iter) + del self.data_batch + self._iter += 1 + self._mode = ModeKeys.TRAIN + + if i + 1 >= self.iters_per_epoch: + break + + self.invoke_hook(TrainerStages.after_train_epoch) + self._epoch += 1 + + self.invoke_hook(TrainerStages.after_run) + + def train_step(self, model, inputs, idx): + """ Perform a training step on a batch of inputs. + + Subclass and override to inject custom behavior. + + Args: + model (`TorchModel`): The model to train. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + + Return: + `torch.Tensor`: The tensor with training loss on this batch. 
+ """ + # EvaluationHook will do evaluate and change mode to val, return to train mode + # TODO: find more pretty way to change mode + model.train() + self._mode = ModeKeys.TRAIN + # call model forward but not __call__ to skip postprocess + if isinstance(inputs, + Mapping) and not func_receive_dict_inputs(model.forward): + train_outputs = model.model._do_step(**inputs, optimizer_idx=idx) + else: + train_outputs = model.model._do_step(inputs, optimizer_idx=idx) + + if not isinstance(train_outputs, dict): + raise TypeError('"model.forward()" must return a dict') + + # add model output info to log + if 'log_vars' not in train_outputs: + default_keys_pattern = ['loss'] + match_keys = set([]) + for key_p in default_keys_pattern: + match_keys.update( + [key for key in train_outputs.keys() if key_p in key]) + + log_vars = {} + for key in match_keys: + value = train_outputs.get(key, None) + if value is not None: + if dist.is_available() and dist.is_initialized(): + value = value.data.clone() + dist.all_reduce(value.div_(dist.get_world_size())) + log_vars.update({key: value.item()}) + self.log_buffer.update(log_vars) + else: + self.log_buffer.update(train_outputs['log_vars']) + + self.train_outputs = train_outputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2331dc85..2a5ac694 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -47,6 +47,8 @@ class CVTasks(object): face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + crowd_counting = 'crowd-counting' + # image editing skin_retouching = 'skin-retouching' image_super_resolution = 'image-super-resolution' @@ -54,6 +56,7 @@ class CVTasks(object): image_color_enhancement = 'image-color-enhancement' image_denoising = 'image-denoising' image_portrait_enhancement = 'image-portrait-enhancement' + image_inpainting = 'image-inpainting' # image generation image_to_image_translation = 'image-to-image-translation' @@ -72,7 +75,6 @@ class CVTasks(object): video_category = 'video-category' video_embedding = 'video-embedding' virtual_try_on = 'virtual-try-on' - crowd_counting = 'crowd-counting' movie_scene_segmentation = 'movie-scene-segmentation' # video editing diff --git a/requirements/cv.txt b/requirements/cv.txt index f907256d..e6ffb5ff 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -7,6 +7,8 @@ ffmpeg-python>=0.2.0 ftfy imageio>=2.9.0 imageio-ffmpeg>=0.4.2 +imgaug>=0.4.0 +kornia>=0.5.0 lmdb lpips ml_collections diff --git a/tests/pipelines/test_image_inpainting.py b/tests/pipelines/test_image_inpainting.py new file mode 100644 index 00000000..b89ce399 --- /dev/null +++ b/tests/pipelines/test_image_inpainting.py @@ -0,0 +1,77 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +import cv2 +import torch +from PIL import Image + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class ImageInpaintingTest(unittest.TestCase): + + def setUp(self) -> None: + self.input_location = 'data/test/images/image_inpainting/image_inpainting.png' + self.input_mask_location = 'data/test/images/image_inpainting/image_inpainting_mask.png' + self.model_id = 'damo/cv_fft_inpainting_lama' + + def save_result(self, result): + vis_img = result[OutputKeys.OUTPUT_IMG] + cv2.imwrite('result.png', vis_img) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_inpainting(self): + inpainting = pipeline(Tasks.image_inpainting, model=self.model_id) + result = inpainting(self.input_location + '+' + + self.input_mask_location) + if result: + self.save_result(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') + def test_inpainting_with_refinement(self): + # if input image is HR, set refine=True is more better + inpainting = pipeline( + Tasks.image_inpainting, model=self.model_id, refine=True) + result = inpainting(self.input_location + '+' + + self.input_mask_location) + if result: + self.save_result(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_inpainting_with_image(self): + inpainting = pipeline(Tasks.image_inpainting, model=self.model_id) + img = Image.open(self.input_location).convert('RGB') + mask = Image.open(self.input_mask_location).convert('RGB') + img_new = Image.new('RGB', (img.width + mask.width, img.height)) + img_new.paste(img, (0, 0)) + img_new.paste(mask, (img.width, 0)) + result = inpainting(img_new) + if result: + self.save_result(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_inpainting_with_default_task(self): + inpainting = pipeline(Tasks.image_inpainting) + result = inpainting(self.input_location + '+' + + self.input_mask_location) + if result: + self.save_result(result) + else: + raise ValueError('process error') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/run_config.yaml b/tests/run_config.yaml index 4c571b7f..b4149dc9 100644 --- a/tests/run_config.yaml +++ b/tests/run_config.yaml @@ -10,6 +10,7 @@ isolated: # test cases that may require excessive anmount of GPU memory, which - test_easycv_trainer.py - test_segformer.py - test_segmentation_pipeline.py + - test_image_inpainting.py envs: default: # default env, case not in other env will in default, pytorch. diff --git a/tests/trainers/test_image_inpainting_trainer.py b/tests/trainers/test_image_inpainting_trainer.py new file mode 100644 index 00000000..807fe64f --- /dev/null +++ b/tests/trainers/test_image_inpainting_trainer.py @@ -0,0 +1,84 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
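# Editorial sketch (hypothetical values, single process, no torch.distributed):
# how ImageInpaintingTrainer.train_step added earlier in this patch folds every
# output key containing 'loss' into log_vars when the model does not return an
# explicit 'log_vars' entry. In distributed runs the real code additionally
# all_reduces each value and divides by the world size before calling .item().
import torch

train_outputs = {                      # hypothetical _do_step() output
    'loss': torch.tensor(0.731),
    'gen_loss': torch.tensor(0.512),
    'discr_loss': torch.tensor(0.219),
    'psnr': 24.9,                      # not a loss key, so it is not logged here
}
match_keys = {k for k in train_outputs if 'loss' in k}
log_vars = {k: train_outputs[k].item() for k in match_keys}
print(sorted(log_vars))                # ['discr_loss', 'gen_loss', 'loss']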
+import os +import shutil +import tempfile +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.models.cv.image_inpainting import FFTInpainting +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class ImageInpaintingTrainerTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + self.model_id = 'damo/cv_fft_inpainting_lama' + self.cache_path = snapshot_download(self.model_id) + cfg = Config.from_file( + os.path.join(self.cache_path, ModelFile.CONFIGURATION)) + + train_data_cfg = ConfigDict( + name='PlacesToydataset', + split='train', + mask_gen_kwargs=cfg.dataset.mask_gen_kwargs, + out_size=cfg.dataset.train_out_size, + test_mode=False) + + test_data_cfg = ConfigDict( + name='PlacesToydataset', + split='test', + mask_gen_kwargs=cfg.dataset.mask_gen_kwargs, + out_size=cfg.dataset.val_out_size, + test_mode=True) + + self.train_dataset = MsDataset.load( + dataset_name=train_data_cfg.name, + split=train_data_cfg.split, + mask_gen_kwargs=train_data_cfg.mask_gen_kwargs, + out_size=train_data_cfg.out_size, + test_mode=train_data_cfg.test_mode) + assert next( + iter(self.train_dataset.config_kwargs['split_config'].values())) + + self.test_dataset = MsDataset.load( + dataset_name=test_data_cfg.name, + split=test_data_cfg.split, + mask_gen_kwargs=test_data_cfg.mask_gen_kwargs, + out_size=test_data_cfg.out_size, + test_mode=test_data_cfg.test_mode) + assert next( + iter(self.test_dataset.config_kwargs['split_config'].values())) + + def tearDown(self): + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_trainer(self): + kwargs = dict( + model=self.model_id, + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset) + + trainer = build_trainer( + name=Trainers.image_inpainting, default_args=kwargs) + trainer.train() + results_files = os.listdir(trainer.work_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + + +if __name__ == '__main__': + unittest.main() From 2bfdbbc9d0d77372ccfeb85745c2bcf8c736b534 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Tue, 11 Oct 2022 22:23:36 +0800 Subject: [PATCH 12/57] [to #42322933]update fer to satisfy demo service requirements Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10357094 --- .../models/cv/face_detection/mogface/models/detectors.py | 2 ++ .../pipelines/cv/facial_expression_recognition_pipeline.py | 5 ++++- modelscope/utils/cv/image_utils.py | 7 +------ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/modelscope/models/cv/face_detection/mogface/models/detectors.py b/modelscope/models/cv/face_detection/mogface/models/detectors.py index 5ae67104..8c1d9150 100644 --- a/modelscope/models/cv/face_detection/mogface/models/detectors.py +++ b/modelscope/models/cv/face_detection/mogface/models/detectors.py @@ -1,3 +1,5 @@ +# The implementation is based on MogFace, available at +# https://github.com/damo-cv/MogFace import os import cv2 diff --git 
a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py index 1b1f13d1..b598a457 100644 --- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -45,6 +45,9 @@ class FacialExpressionRecognitionPipeline(Pipeline): # face detect pipeline det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + self.map_list = [ + 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral' + ] self.face_detection = pipeline( Tasks.face_detection, model=det_model_id) @@ -122,7 +125,7 @@ class FacialExpressionRecognitionPipeline(Pipeline): labels = result[1].tolist() return { OutputKeys.SCORES: scores, - OutputKeys.LABELS: labels, + OutputKeys.LABELS: self.map_list[labels] } def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 98ba533e..ad0d6c8e 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -113,12 +113,7 @@ def draw_face_detection_no_lm_result(img_path, detection_result): def draw_facial_expression_result(img_path, facial_expression_result): - label_idx = facial_expression_result[OutputKeys.LABELS] - map_list = [ - 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral' - ] - label = map_list[label_idx] - + label = facial_expression_result[OutputKeys.LABELS] img = cv2.imread(img_path) assert img is not None, f"Can't read img: {img_path}" cv2.putText( From 0d97f8959d2095ecfd4b43bb4eb607534474a44f Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Tue, 11 Oct 2022 22:24:19 +0800 Subject: [PATCH 13/57] [to #42322933] test: unify kws pipeline input type to AUDIO Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10362437 --- .../test_key_word_spotting_farfield.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py index f8c167de..bf61c9e7 100644 --- a/tests/pipelines/test_key_word_spotting_farfield.py +++ b/tests/pipelines/test_key_word_spotting_farfield.py @@ -22,18 +22,14 @@ class KWSFarfieldTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_normal(self): kws = pipeline(Tasks.keyword_spotting, model=self.model_id) - inputs = {'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE)} - result = kws(inputs) + result = kws(os.path.join(os.getcwd(), TEST_SPEECH_FILE)) self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_mono(self): kws = pipeline(Tasks.keyword_spotting, model=self.model_id) - inputs = { - 'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO) - } - result = kws(inputs) + result = kws(os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO)) self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) @@ -44,17 +40,6 @@ class KWSFarfieldTest(unittest.TestCase): self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_output(self): - kws = pipeline(Tasks.keyword_spotting, model=self.model_id) - inputs = { - 'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE), - 'output_file': 'output.wav' - } - result = kws(inputs) - 
self.assertEqual(len(result['kws_list']), 5) - print(result['kws_list'][-1]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_input_bytes(self): with open(os.path.join(os.getcwd(), TEST_SPEECH_FILE), 'rb') as f: From da5d5cd10bf8bb75ff9fde2df3f0112308c35cc7 Mon Sep 17 00:00:00 2001 From: "xixing.tj" Date: Tue, 11 Oct 2022 22:37:57 +0800 Subject: [PATCH 14/57] [to #42322933]add copyright info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加ocr部分代码的copyright信息 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10342392 --- .../cv/ocr_utils/model_convnext_transformer.py | 1 + .../model_resnet_mutex_v4_linewithchar.py | 2 ++ .../pipelines/cv/ocr_utils/ocr_modules/convnext.py | 10 ++-------- .../cv/ocr_utils/ocr_modules/timm_tinyc.py | 6 ++---- .../pipelines/cv/ocr_utils/ocr_modules/vitstr.py | 9 ++------- modelscope/pipelines/cv/ocr_utils/ops.py | 2 ++ modelscope/pipelines/cv/ocr_utils/resnet18_v1.py | 14 ++++++++++++++ modelscope/pipelines/cv/ocr_utils/resnet_utils.py | 14 ++++++++++++++ modelscope/pipelines/cv/ocr_utils/utils.py | 1 + 9 files changed, 40 insertions(+), 19 deletions(-) diff --git a/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py b/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py index cf5e2fe1..6ecff7ef 100644 --- a/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py +++ b/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import torch import torch.nn as nn diff --git a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py index d03ff405..2c2d5b00 100644 --- a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py +++ b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from SegLink, +# publicly available at https://github.com/bgshih/seglink import tensorflow as tf from . import ops, resnet18_v1, resnet_utils diff --git a/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py b/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py index c2059107..c0e30616 100644 --- a/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py +++ b/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py @@ -1,11 +1,5 @@ -""" Contains various versions of ConvNext Networks. -ConvNext Networks (ConvNext) were proposed in: - Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell and Saining Xie - A ConvNet for the 2020s. CVPR 2022. -Compared to https://github.com/facebookresearch/ConvNeXt, -we obtain different ConvNext variants by changing the network depth, width, -feature number, and downsample ratio. -""" +# Part of the implementation is borrowed and modified from ConvNext, +# publicly available at https://github.com/facebookresearch/ConvNeXt import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py b/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py index f54c0e78..555b1e42 100644 --- a/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py +++ b/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py @@ -1,7 +1,5 @@ -'''Referenced from rwightman's pytorch-image-models(timm). 
-Github: https://github.com/rwightman/pytorch-image-models -We use some modules and modify the parameters according to our network. -''' +# Part of the implementation is borrowed and modified from timm, +# publicly available at https://github.com/rwightman/pytorch-image-models import collections.abc import logging import math diff --git a/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py b/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py index e7d96574..5ce3aeca 100644 --- a/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py +++ b/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py @@ -1,10 +1,5 @@ -""" Contains various versions of ViTSTR. -ViTSTR were proposed in: - Rowel Atienza - Vision transformer for fast and efficient scene text recognition. ICDAR 2021. -Compared to https://github.com/roatienza/deep-text-recognition-benchmark, -we obtain different ViTSTR variants by changing the network patch_size and in_chans. -""" +# Part of the implementation is borrowed and modified from ViTSTR, +# publicly available at https://github.com/roatienza/deep-text-recognition-benchmark from __future__ import absolute_import, division, print_function import logging from copy import deepcopy diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py index 09807b10..a36838a6 100644 --- a/modelscope/pipelines/cv/ocr_utils/ops.py +++ b/modelscope/pipelines/cv/ocr_utils/ops.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from SegLink, +# publicly available at https://github.com/bgshih/seglink import math import os import shutil diff --git a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py index 7930c5a3..85f9faca 100644 --- a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py +++ b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py @@ -1,3 +1,17 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Contains definitions for the original form of Residual Networks. The 'v1' residual networks (ResNets) implemented in this module were proposed by: diff --git a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py index 0a9af224..2ccbd038 100644 --- a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py +++ b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py @@ -1,3 +1,17 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Contains building blocks for various versions of Residual Networks. Residual networks (ResNets) were proposed in: Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun diff --git a/modelscope/pipelines/cv/ocr_utils/utils.py b/modelscope/pipelines/cv/ocr_utils/utils.py index be8e3371..1d0fb297 100644 --- a/modelscope/pipelines/cv/ocr_utils/utils.py +++ b/modelscope/pipelines/cv/ocr_utils/utils.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import cv2 import numpy as np From 922f4c589b8da4111e787cd38d0a53518aaa8ada Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Tue, 11 Oct 2022 22:46:30 +0800 Subject: [PATCH 15/57] =?UTF-8?q?[to=20#42322933]=E5=9B=BE=E5=83=8F?= =?UTF-8?q?=E5=8E=BB=E5=99=AAusing=20msdataset=20to=20load=20dataset=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-M?= =?UTF-8?q?aaS/MaaS-lib/codereview/10338265?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modelscope/metrics/image_denoise_metric.py | 140 +++++++++++++++- .../cv/image_denoise/nafnet/NAFNet_arch.py | 5 + .../cv/image_denoise/nafnet/arch_util.py | 5 + .../image_denoise/nafnet_for_image_denoise.py | 1 + .../image_denoise_data/data_utils.py | 152 ------------------ .../image_denoise_dataset.py | 78 --------- .../sidd_image_denoising}/__init__.py | 4 +- .../sidd_image_denoising/data_utils.py | 46 ++++++ .../sidd_image_denoising_dataset.py | 62 +++++++ .../sidd_image_denoising}/transforms.py | 0 .../pipelines/cv/image_denoise_pipeline.py | 2 +- tests/pipelines/test_image_denoise.py | 25 ++- tests/trainers/test_image_denoise_trainer.py | 24 ++- 13 files changed, 284 insertions(+), 260 deletions(-) delete mode 100644 modelscope/msdatasets/image_denoise_data/data_utils.py delete mode 100644 modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py rename modelscope/msdatasets/{image_denoise_data => task_datasets/sidd_image_denoising}/__init__.py (73%) create mode 100644 modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py create mode 100644 modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py rename modelscope/msdatasets/{image_denoise_data => task_datasets/sidd_image_denoising}/transforms.py (100%) diff --git a/modelscope/metrics/image_denoise_metric.py b/modelscope/metrics/image_denoise_metric.py index 94ec9dc7..c6df8df1 100644 --- a/modelscope/metrics/image_denoise_metric.py +++ b/modelscope/metrics/image_denoise_metric.py @@ -1,7 +1,9 @@ +# The code is modified based on BasicSR metrics: +# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py from typing import Dict +import cv2 import numpy as np -from skimage.metrics import peak_signal_noise_ratio, structural_similarity from modelscope.metainfo import Metrics from modelscope.utils.registry import default_group @@ -34,12 +36,138 @@ class ImageDenoiseMetric(Metric): def evaluate(self): psnr_list, ssim_list = [], [] for (pred, label) in zip(self.preds, self.labels): - psnr_list.append( - peak_signal_noise_ratio(label[0], pred[0], data_range=255)) - ssim_list.append( - structural_similarity( - label[0], pred[0], multichannel=True, data_range=255)) + psnr_list.append(calculate_psnr(label[0], pred[0], crop_border=0)) + ssim_list.append(calculate_ssim(label[0], pred[0], crop_border=0)) return { 
MetricKeys.PSNR: np.mean(psnr_list), MetricKeys.SSIM: np.mean(ssim_list) } + + +def reorder_image(img, input_order='HWC'): + """Reorder images to 'HWC' order. + If the input_order is (h, w), return (h, w, 1); + If the input_order is (c, h, w), return (h, w, c); + If the input_order is (h, w, c), return as it is. + Args: + img (ndarray): Input image. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + If the input image shape is (h, w), input_order will not have + effects. Default: 'HWC'. + Returns: + ndarray: reordered image. + """ + + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f"Wrong input_order {input_order}. Supported input_orders are 'HWC' and 'CHW'" + ) + if len(img.shape) == 2: + img = img[..., None] + if input_order == 'CHW': + img = img.transpose(1, 2, 0) + return img + + +def calculate_psnr(img, img2, crop_border, input_order='HWC', **kwargs): + """Calculate PSNR (Peak Signal-to-Noise Ratio). + Reference: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + Args: + img (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. + crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. Default: 'HWC'. + Returns: + float: PSNR result. + """ + + assert img.shape == img2.shape, ( + f'Image shapes are different: {img.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' + ) + img = reorder_image(img, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img = img[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + img = img.astype(np.float64) + img2 = img2.astype(np.float64) + + mse = np.mean((img - img2)**2) + if mse == 0: + return float('inf') + return 10. * np.log10(255. * 255. / mse) + + +def calculate_ssim(img, img2, crop_border, input_order='HWC', **kwargs): + """Calculate SSIM (structural similarity). + ``Paper: Image quality assessment: From error visibility to structural similarity`` + The results are the same as that of the official released MATLAB code in + https://ece.uwaterloo.ca/~z70wang/research/ssim/. + For three-channel images, SSIM is calculated for each channel and then + averaged. + Args: + img (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. + crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + Returns: + float: SSIM result. + """ + + assert img.shape == img2.shape, ( + f'Image shapes are different: {img.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' + ) + img = reorder_image(img, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img = img[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] 
+ + img = img.astype(np.float64) + img2 = img2.astype(np.float64) + + ssims = [] + for i in range(img.shape[2]): + ssims.append(_ssim(img[..., i], img2[..., i])) + return np.array(ssims).mean() + + +def _ssim(img, img2): + """Calculate SSIM (structural similarity) for one channel images. + It is called by func:`calculate_ssim`. + Args: + img (ndarray): Images with range [0, 255] with order 'HWC'. + img2 (ndarray): Images with range [0, 255] with order 'HWC'. + Returns: + float: SSIM result. + """ + + c1 = (0.01 * 255)**2 + c2 = (0.03 * 255)**2 + kernel = cv2.getGaussianKernel(11, 1.5) + window = np.outer(kernel, kernel.transpose()) + + mu1 = cv2.filter2D(img, -1, window)[5:-5, + 5:-5] # valid mode for window size 11 + mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5] + mu1_sq = mu1**2 + mu2_sq = mu2**2 + mu1_mu2 = mu1 * mu2 + sigma1_sq = cv2.filter2D(img**2, -1, window)[5:-5, 5:-5] - mu1_sq + sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq + sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 + + tmp1 = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2) + tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) + ssim_map = tmp1 / tmp2 + return ssim_map.mean() diff --git a/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py b/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py index 5b4e8ce1..c4de0729 100644 --- a/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py +++ b/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------ +# Modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/models/archs/NAFNet_arch.py +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ + import numpy as np import torch import torch.nn as nn diff --git a/modelscope/models/cv/image_denoise/nafnet/arch_util.py b/modelscope/models/cv/image_denoise/nafnet/arch_util.py index df394dd5..2d406141 100644 --- a/modelscope/models/cv/image_denoise/nafnet/arch_util.py +++ b/modelscope/models/cv/image_denoise/nafnet/arch_util.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------ +# Modified from BasicSR (https://github.com/xinntao/BasicSR) +# Copyright 2018-2020 BasicSR Authors +# ------------------------------------------------------------------------ + import torch import torch.nn as nn diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index c484b37b..a6fbf22f 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
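# Editorial sketch: a worked check of the metric helpers above. The import path
# follows the file added in this patch (modelscope/metrics/image_denoise_metric.py).
import numpy as np

from modelscope.metrics.image_denoise_metric import (calculate_psnr,
                                                     calculate_ssim)

gt = np.full((64, 64, 3), 100, dtype=np.uint8)   # HWC image, range [0, 255]
noisy = gt + 16                                  # constant error of 16 per pixel
# MSE = 16 ** 2 = 256, so PSNR = 10 * log10(255 ** 2 / 256) ~= 24.05 dB
print(calculate_psnr(gt, noisy, crop_border=0, input_order='HWC'))
# SSIM is computed per channel with an 11x11 Gaussian window, then averaged.
print(calculate_ssim(gt, noisy, crop_border=0, input_order='HWC'))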
import os from copy import deepcopy from typing import Any, Dict, Union diff --git a/modelscope/msdatasets/image_denoise_data/data_utils.py b/modelscope/msdatasets/image_denoise_data/data_utils.py deleted file mode 100644 index dd735830..00000000 --- a/modelscope/msdatasets/image_denoise_data/data_utils.py +++ /dev/null @@ -1,152 +0,0 @@ -# ------------------------------------------------------------------------ -# Modified from BasicSR (https://github.com/xinntao/BasicSR) -# Copyright 2018-2020 BasicSR Authors -# ------------------------------------------------------------------------ -import os -from os import path as osp - -import cv2 -import numpy as np -import torch - -from .transforms import mod_crop - - -def img2tensor(imgs, bgr2rgb=True, float32=True): - """Numpy array to tensor. - Args: - imgs (list[ndarray] | ndarray): Input images. - bgr2rgb (bool): Whether to change bgr to rgb. - float32 (bool): Whether to change to float32. - Returns: - list[tensor] | tensor: Tensor images. If returned results only have - one element, just return tensor. - """ - - def _totensor(img, bgr2rgb, float32): - if img.shape[2] == 3 and bgr2rgb: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - img = torch.from_numpy(img.transpose(2, 0, 1)) - if float32: - img = img.float() - return img - - if isinstance(imgs, list): - return [_totensor(img, bgr2rgb, float32) for img in imgs] - else: - return _totensor(imgs, bgr2rgb, float32) - - -def scandir(dir_path, keyword=None, recursive=False, full_path=False): - """Scan a directory to find the interested files. - Args: - dir_path (str): Path of the directory. - keyword (str | tuple(str), optional): File keyword that we are - interested in. Default: None. - recursive (bool, optional): If set to True, recursively scan the - directory. Default: False. - full_path (bool, optional): If set to True, include the dir_path. - Default: False. - Returns: - A generator for all the interested files with relative pathes. - """ - - if (keyword is not None) and not isinstance(keyword, (str, tuple)): - raise TypeError('"suffix" must be a string or tuple of strings') - - root = dir_path - - def _scandir(dir_path, keyword, recursive): - for entry in os.scandir(dir_path): - if not entry.name.startswith('.') and entry.is_file(): - if full_path: - return_path = entry.path - else: - return_path = osp.relpath(entry.path, root) - - if keyword is None: - yield return_path - elif keyword in return_path: - yield return_path - else: - if recursive: - yield from _scandir( - entry.path, keyword=keyword, recursive=recursive) - else: - continue - - return _scandir(dir_path, keyword=keyword, recursive=recursive) - - -def padding(img_lq, img_gt, gt_size): - h, w, _ = img_lq.shape - - h_pad = max(0, gt_size - h) - w_pad = max(0, gt_size - w) - - if h_pad == 0 and w_pad == 0: - return img_lq, img_gt - - img_lq = cv2.copyMakeBorder(img_lq, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) - img_gt = cv2.copyMakeBorder(img_gt, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) - return img_lq, img_gt - - -def read_img_seq(path, require_mod_crop=False, scale=1): - """Read a sequence of images from a given folder path. - Args: - path (list[str] | str): List of image paths or image folder path. - require_mod_crop (bool): Require mod crop for each image. - Default: False. - scale (int): Scale factor for mod_crop. Default: 1. - Returns: - Tensor: size (t, c, h, w), RGB, [0, 1]. 
- """ - if isinstance(path, list): - img_paths = path - else: - img_paths = sorted(list(scandir(path, full_path=True))) - imgs = [cv2.imread(v).astype(np.float32) / 255. for v in img_paths] - if require_mod_crop: - imgs = [mod_crop(img, scale) for img in imgs] - imgs = img2tensor(imgs, bgr2rgb=True, float32=True) - imgs = torch.stack(imgs, dim=0) - return imgs - - -def paired_paths_from_folder(folders, keys, filename_tmpl): - """Generate paired paths from folders. - Args: - folders (list[str]): A list of folder path. The order of list should - be [input_folder, gt_folder]. - keys (list[str]): A list of keys identifying folders. The order should - be in consistent with folders, e.g., ['lq', 'gt']. - filename_tmpl (str): Template for each filename. Note that the - template excludes the file extension. Usually the filename_tmpl is - for files in the input folder. - Returns: - list[str]: Returned path list. - """ - assert len(folders) == 2, ( - 'The len of folders should be 2 with [input_folder, gt_folder]. ' - f'But got {len(folders)}') - assert len(keys) == 2, ( - 'The len of keys should be 2 with [input_key, gt_key]. ' - f'But got {len(keys)}') - input_folder, gt_folder = folders - input_key, gt_key = keys - - input_paths = list(scandir(input_folder, keyword='NOISY', recursive=True)) - gt_paths = list(scandir(gt_folder, keyword='GT', recursive=True)) - assert len(input_paths) == len(gt_paths), ( - f'{input_key} and {gt_key} datasets have different number of images: ' - f'{len(input_paths)}, {len(gt_paths)}.') - paths = [] - for idx in range(len(gt_paths)): - gt_path = os.path.join(gt_folder, gt_paths[idx]) - input_path = os.path.join(input_folder, gt_path.replace('GT', 'NOISY')) - - paths.append( - dict([(f'{input_key}_path', input_path), - (f'{gt_key}_path', gt_path)])) - return paths diff --git a/modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py b/modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py deleted file mode 100644 index 96b777e6..00000000 --- a/modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -from typing import Callable, List, Optional, Tuple, Union - -import cv2 -import numpy as np -from torch.utils import data - -from .data_utils import img2tensor, padding, paired_paths_from_folder -from .transforms import augment, paired_random_crop - - -def default_loader(path): - return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 - - -class PairedImageDataset(data.Dataset): - """Paired image dataset for image restoration. - """ - - def __init__(self, opt, root, is_train): - super(PairedImageDataset, self).__init__() - self.opt = opt - self.is_train = is_train - self.gt_folder, self.lq_folder = os.path.join( - root, opt.dataroot_gt), os.path.join(root, opt.dataroot_lq) - - if opt.filename_tmpl is not None: - self.filename_tmpl = opt.filename_tmpl - else: - self.filename_tmpl = '{}' - self.paths = paired_paths_from_folder([self.lq_folder, self.gt_folder], - ['lq', 'gt'], self.filename_tmpl) - - def __getitem__(self, index): - scale = self.opt.scale - - # Load gt and lq images. Dimension order: HWC; channel order: BGR; - # image range: [0, 1], float32. 
- gt_path = self.paths[index]['gt_path'] - img_gt = default_loader(gt_path) - lq_path = self.paths[index]['lq_path'] - img_lq = default_loader(lq_path) - - # augmentation for training - # if self.is_train: - gt_size = self.opt.gt_size - # padding - img_gt, img_lq = padding(img_gt, img_lq, gt_size) - - # random crop - img_gt, img_lq = paired_random_crop(img_gt, img_lq, gt_size, scale) - - # flip, rotation - img_gt, img_lq = augment([img_gt, img_lq], self.opt.use_flip, - self.opt.use_rot) - - # BGR to RGB, HWC to CHW, numpy to tensor - img_gt, img_lq = img2tensor([img_gt, img_lq], - bgr2rgb=True, - float32=True) - - return { - 'input': img_lq, - 'target': img_gt, - 'input_path': lq_path, - 'target_path': gt_path - } - - def __len__(self): - return len(self.paths) - - def to_torch_dataset( - self, - columns: Union[str, List[str]] = None, - preprocessors: Union[Callable, List[Callable]] = None, - **format_kwargs, - ): - return self diff --git a/modelscope/msdatasets/image_denoise_data/__init__.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py similarity index 73% rename from modelscope/msdatasets/image_denoise_data/__init__.py rename to modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py index ba1d2df8..5376cd7c 100644 --- a/modelscope/msdatasets/image_denoise_data/__init__.py +++ b/modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .image_denoise_dataset import PairedImageDataset + from .sidd_image_denoising_dataset import SiddImageDenoisingDataset else: _import_structure = { - 'image_denoise_dataset': ['PairedImageDataset'], + 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], } import sys diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py new file mode 100644 index 00000000..33fce4c8 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py @@ -0,0 +1,46 @@ +# ------------------------------------------------------------------------ +# Modified from BasicSR (https://github.com/xinntao/BasicSR) +# Copyright 2018-2020 BasicSR Authors +# ------------------------------------------------------------------------ + +import cv2 +import torch + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. + float32 (bool): Whether to change to float32. + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor. 
+ """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) + + +def padding(img_lq, img_gt, gt_size): + h, w, _ = img_lq.shape + + h_pad = max(0, gt_size - h) + w_pad = max(0, gt_size - w) + + if h_pad == 0 and w_pad == 0: + return img_lq, img_gt + + img_lq = cv2.copyMakeBorder(img_lq, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) + img_gt = cv2.copyMakeBorder(img_gt, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) + return img_lq, img_gt diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py new file mode 100644 index 00000000..3f0cdae0 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import cv2 +import numpy as np + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks +from .data_utils import img2tensor, padding +from .transforms import augment, paired_random_crop + + +def default_loader(path): + return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 + + +@TASK_DATASETS.register_module( + Tasks.image_denoising, module_name=Models.nafnet) +class SiddImageDenoisingDataset(TorchTaskDataset): + """Paired image dataset for image restoration. + """ + + def __init__(self, dataset, opt, is_train): + self.dataset = dataset + self.opt = opt + self.is_train = is_train + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + + # Load gt and lq images. Dimension order: HWC; channel order: BGR; + # image range: [0, 1], float32. 
+ item_dict = self.dataset[index] + gt_path = item_dict['Clean Image:FILE'] + img_gt = default_loader(gt_path) + lq_path = item_dict['Noisy Image:FILE'] + img_lq = default_loader(lq_path) + + # augmentation for training + if self.is_train: + gt_size = self.opt.gt_size + # padding + img_gt, img_lq = padding(img_gt, img_lq, gt_size) + + # random crop + img_gt, img_lq = paired_random_crop( + img_gt, img_lq, gt_size, scale=1) + + # flip, rotation + img_gt, img_lq = augment([img_gt, img_lq], self.opt.use_flip, + self.opt.use_rot) + + # BGR to RGB, HWC to CHW, numpy to tensor + img_gt, img_lq = img2tensor([img_gt, img_lq], + bgr2rgb=True, + float32=True) + + return {'input': img_lq, 'target': img_gt} diff --git a/modelscope/msdatasets/image_denoise_data/transforms.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/transforms.py similarity index 100% rename from modelscope/msdatasets/image_denoise_data/transforms.py rename to modelscope/msdatasets/task_datasets/sidd_image_denoising/transforms.py diff --git a/modelscope/pipelines/cv/image_denoise_pipeline.py b/modelscope/pipelines/cv/image_denoise_pipeline.py index a11abf36..34ac1e81 100644 --- a/modelscope/pipelines/cv/image_denoise_pipeline.py +++ b/modelscope/pipelines/cv/image_denoise_pipeline.py @@ -105,4 +105,4 @@ class ImageDenoisePipeline(Pipeline): def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]: output_img = (input['output_tensor'].squeeze(0) * 255).cpu().permute( 1, 2, 0).numpy().astype('uint8') - return {OutputKeys.OUTPUT_IMG: output_img} + return {OutputKeys.OUTPUT_IMG: output_img[:, :, ::-1]} diff --git a/tests/pipelines/test_image_denoise.py b/tests/pipelines/test_image_denoise.py index bf8cfd0f..d95dd343 100644 --- a/tests/pipelines/test_image_denoise.py +++ b/tests/pipelines/test_image_denoise.py @@ -2,8 +2,6 @@ import unittest -from PIL import Image - from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.outputs import OutputKeys @@ -20,16 +18,16 @@ class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): self.task = Tasks.image_denoising self.model_id = 'damo/cv_nafnet_image-denoise_sidd' - demo_image_path = 'data/test/images/noisy-demo-1.png' + demo_image_path = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/noisy-demo-0.png' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) pipeline = ImageDenoisePipeline(cache_path) + pipeline.group_key = self.task denoise_img = pipeline( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -37,9 +35,8 @@ class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): model = Model.from_pretrained(self.model_id) pipeline_ins = pipeline(task=Tasks.image_denoising, model=model) denoise_img = pipeline_ins( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @@ -47,18 +44,16 @@ 
class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline( task=Tasks.image_denoising, model=self.model_id) denoise_img = pipeline_ins( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.image_denoising) denoise_img = pipeline_ins( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skip('demo compatibility test is only enabled on a needed-basis') diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index 261ee4ed..0bcb8930 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -6,10 +6,12 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.cv.image_denoise import NAFNetForImageDenoise -from modelscope.msdatasets.image_denoise_data import PairedImageDataset +from modelscope.msdatasets import MsDataset +from modelscope.msdatasets.task_datasets.sidd_image_denoising import \ + SiddImageDenoisingDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile +from modelscope.utils.constant import DownloadMode, ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level @@ -28,10 +30,20 @@ class ImageDenoiseTrainerTest(unittest.TestCase): self.cache_path = snapshot_download(self.model_id) self.config = Config.from_file( os.path.join(self.cache_path, ModelFile.CONFIGURATION)) - self.dataset_train = PairedImageDataset( - self.config.dataset, self.cache_path, is_train=True) - self.dataset_val = PairedImageDataset( - self.config.dataset, self.cache_path, is_train=False) + dataset_train = MsDataset.load( + 'SIDD', + namespace='huizheng', + split='validation', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + dataset_val = MsDataset.load( + 'SIDD', + namespace='huizheng', + split='test', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + self.dataset_train = SiddImageDenoisingDataset( + dataset_train, self.config.dataset, is_train=True) + self.dataset_val = SiddImageDenoisingDataset( + dataset_val, self.config.dataset, is_train=False) def tearDown(self): shutil.rmtree(self.tmp_dir, ignore_errors=True) From 800588b8a6f4a867f8fd58cc05206f780e13dc9c Mon Sep 17 00:00:00 2001 From: ly261666 Date: Wed, 12 Oct 2022 10:53:47 +0800 Subject: [PATCH 16/57] [to #42322933]add licence on MogFace Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10335569 From 42be514bac5f985d6d8ce710646e2a79e3d81d39 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Wed, 12 Oct 2022 15:17:11 +0800 Subject: [PATCH 17/57] [to #42322933]update fer to satisfy demo service requirements Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10372291 --- .../pipelines/cv/facial_expression_recognition_pipeline.py | 6 +----- modelscope/utils/cv/image_utils.py | 4 +++- 2 files 
changed, 4 insertions(+), 6 deletions(-) diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py index b598a457..3c85ae62 100644 --- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -122,11 +122,7 @@ class FacialExpressionRecognitionPipeline(Pipeline): result = self.fer(input) assert result is not None scores = result[0].tolist() - labels = result[1].tolist() - return { - OutputKeys.SCORES: scores, - OutputKeys.LABELS: self.map_list[labels] - } + return {OutputKeys.SCORES: scores, OutputKeys.LABELS: self.map_list} def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index ad0d6c8e..eab74688 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -113,7 +113,9 @@ def draw_face_detection_no_lm_result(img_path, detection_result): def draw_facial_expression_result(img_path, facial_expression_result): - label = facial_expression_result[OutputKeys.LABELS] + scores = facial_expression_result[OutputKeys.SCORES] + labels = facial_expression_result[OutputKeys.LABELS] + label = labels[np.argmax(scores)] img = cv2.imread(img_path) assert img is not None, f"Can't read img: {img_path}" cv2.putText( From 71459900544438b3d44bf0e922cdda64ac4d5701 Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Wed, 12 Oct 2022 15:18:35 +0800 Subject: [PATCH 18/57] [to #42322933] reivse model problem and remove history sql for demo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 相比于master上的tableqa,做出了如下修复: 1. 修复了schema linking中的问题。 2. 同时设置了有history sql和没有history sql的两种输入 3. 
增加了sqlite执行逻辑,可以返回sql执行结果 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10365114 --- .../models/nlp/table_question_answering.py | 3 +- modelscope/outputs.py | 1 + .../nlp/table_question_answering_pipeline.py | 61 +++++++++--- .../preprocessors/star3/fields/database.py | 53 ++++++++++- .../preprocessors/star3/fields/schema_link.py | 33 +++++-- .../table_question_answering_preprocessor.py | 5 +- modelscope/utils/nlp/nlp_utils.py | 17 +--- .../test_table_question_answering.py | 94 +++++++++++++++---- 8 files changed, 206 insertions(+), 61 deletions(-) diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py index 3c91a518..c6a03ef3 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/table_question_answering.py @@ -3,9 +3,11 @@ import os from typing import Dict +import json import numpy import torch import torch.nn.functional as F +import tqdm from transformers import BertTokenizer from modelscope.metainfo import Models @@ -82,7 +84,6 @@ class TableQuestionAnswering(Model): if ntok.startswith('##'): ntok = ntok.replace('##', '') - tok = nlu1[idx:idx + 1].lower() if ntok == tok: conv_dict[i] = [idx, idx + 1] diff --git a/modelscope/outputs.py b/modelscope/outputs.py index dd59d6fb..0f353d3d 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -37,6 +37,7 @@ class OutputKeys(object): WORD = 'word' KWS_LIST = 'kws_list' HISTORY = 'history' + QUERT_RESULT = 'query_result' TIMESTAMPS = 'timestamps' SHOT_NUM = 'shot_num' SCENE_NUM = 'scene_num' diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 96bfbc34..e1b2b07b 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -2,6 +2,8 @@ import os from typing import Any, Dict, Union +import json +import torch from transformers import BertTokenizer from modelscope.metainfo import Pipelines @@ -230,14 +232,16 @@ class TableQuestionAnsweringPipeline(Pipeline): str_sel_list.append(header_name) sql_sel_list.append(header_id) else: - str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( ' - + header_name + ' )') - sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( ' - + header_id + ' )') + str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' + + header_name + ')') + sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' + + header_id + ')') str_cond_list, sql_cond_list = [], [] for cond in sql['conds']: header_name = header_names[cond[0]] + if header_name == '空列': + continue header_id = '`%s`.`%s`' % (table['table_id'], header_ids[cond[0]]) op = self.cond_ops[cond[1]] value = cond[2] @@ -248,12 +252,17 @@ class TableQuestionAnsweringPipeline(Pipeline): cond = ' ' + self.cond_conn_ops[sql['cond_conn_op']] + ' ' - final_str = 'SELECT %s FROM %s WHERE %s' % (', '.join(str_sel_list), - table['table_name'], - cond.join(str_cond_list)) - final_sql = 'SELECT %s FROM `%s` WHERE %s' % (', '.join(sql_sel_list), - table['table_id'], - cond.join(sql_cond_list)) + if len(str_cond_list) != 0: + final_str = 'SELECT %s FROM %s WHERE %s' % (', '.join( + str_sel_list), table['table_name'], cond.join(str_cond_list)) + final_sql = 'SELECT %s FROM `%s` WHERE %s' % (', '.join( + sql_sel_list), table['table_id'], cond.join(sql_cond_list)) + else: + final_str = 'SELECT %s FROM %s' % (', '.join(str_sel_list), + table['table_name']) + final_sql = 'SELECT %s FROM `%s`' % 
(', '.join(sql_sel_list), + table['table_id']) + sql = SQLQuery( string=final_str, query=final_sql, sql_result=result['sql']) @@ -274,9 +283,39 @@ class TableQuestionAnsweringPipeline(Pipeline): history_sql=history_sql, result=result, table=self.db.tables[result['table_id']]) + result['sql']['from'] = [result['table_id']] sql = self.sql_dict_to_str( result=result, table=self.db.tables[result['table_id']]) - output = {OutputKeys.OUTPUT: sql, OutputKeys.HISTORY: result['sql']} + + # add sqlite + if self.db.is_use_sqlite: + try: + cursor = self.db.connection_obj.cursor().execute(sql.query) + names = [{ + 'name': + description[0], + 'label': + self.db.tables[result['table_id']]['headerid2name'].get( + description[0], description[0]) + } for description in cursor.description] + cells = [] + for res in cursor.fetchall(): + row = {} + for name, cell in zip(names, res): + row[name['name']] = cell + cells.append(row) + tabledata = {'headers': names, 'cells': cells} + except Exception: + tabledata = {'headers': [], 'cells': []} + else: + tabledata = {'headers': [], 'cells': []} + + output = { + OutputKeys.OUTPUT: sql, + OutputKeys.HISTORY: result['sql'], + OutputKeys.QUERT_RESULT: json.dumps(tabledata, ensure_ascii=False), + } + return output def _collate_fn(self, data): diff --git a/modelscope/preprocessors/star3/fields/database.py b/modelscope/preprocessors/star3/fields/database.py index a99800cf..3d3a1f8d 100644 --- a/modelscope/preprocessors/star3/fields/database.py +++ b/modelscope/preprocessors/star3/fields/database.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import sqlite3 + import json import tqdm @@ -7,18 +9,38 @@ from modelscope.preprocessors.star3.fields.struct import Trie class Database: - def __init__(self, tokenizer, table_file_path, syn_dict_file_path): + def __init__(self, + tokenizer, + table_file_path, + syn_dict_file_path, + is_use_sqlite=False): self.tokenizer = tokenizer + self.is_use_sqlite = is_use_sqlite + if self.is_use_sqlite: + self.connection_obj = sqlite3.connect(':memory:') + self.type_dict = {'text': 'TEXT', 'number': 'INT', 'date': 'TEXT'} self.tables = self.init_tables(table_file_path=table_file_path) self.syn_dict = self.init_syn_dict( syn_dict_file_path=syn_dict_file_path) + def __del__(self): + if self.is_use_sqlite: + self.connection_obj.close() + def init_tables(self, table_file_path): tables = {} lines = [] - with open(table_file_path, 'r') as fo: - for line in fo: - lines.append(line) + if type(table_file_path) == str: + with open(table_file_path, 'r') as fo: + for line in fo: + lines.append(line) + elif type(table_file_path) == list: + for path in table_file_path: + with open(path, 'r') as fo: + for line in fo: + lines.append(line) + else: + raise ValueError() for line in tqdm.tqdm(lines, desc='Load Tables'): table = json.loads(line.strip()) @@ -34,6 +56,9 @@ class Database: headers_tokens.append(empty_column) table['tablelen'] = table_header_length table['header_tok'] = headers_tokens + table['headerid2name'] = {} + for hid, hname in zip(table['header_id'], table['header_name']): + table['headerid2name'][hid] = hname table['header_types'].append('null') table['header_units'] = [ @@ -51,6 +76,26 @@ class Database: trie_set[ii].insert(word, word) table['value_trie'] = trie_set + + # create sqlite + if self.is_use_sqlite: + cursor_obj = self.connection_obj.cursor() + cursor_obj.execute('DROP TABLE IF EXISTS %s' % + (table['table_id'])) + header_string = ', '.join([ + '%s %s' % + (name, self.type_dict[htype]) for name, htype in zip( + 
table['header_id'], table['header_types']) + ]) + create_table_string = 'CREATE TABLE %s (%s);' % ( + table['table_id'], header_string) + cursor_obj.execute(create_table_string) + for row in table['rows']: + value_string = ', '.join(['"%s"' % (val) for val in row]) + insert_row_string = 'INSERT INTO %s VALUES(%s)' % ( + table['table_id'], value_string) + cursor_obj.execute(insert_row_string) + tables[table['table_id']] = table return tables diff --git a/modelscope/preprocessors/star3/fields/schema_link.py b/modelscope/preprocessors/star3/fields/schema_link.py index 40613f78..7f483a1f 100644 --- a/modelscope/preprocessors/star3/fields/schema_link.py +++ b/modelscope/preprocessors/star3/fields/schema_link.py @@ -287,7 +287,13 @@ class SchemaLinker: return match_len / (len(nlu_t) + 0.1) - def get_entity_linking(self, tokenizer, nlu, nlu_t, tables, col_syn_dict): + def get_entity_linking(self, + tokenizer, + nlu, + nlu_t, + tables, + col_syn_dict, + history_sql=None): """ get linking between question and schema column """ @@ -305,8 +311,7 @@ class SchemaLinker: typeinfos = [] for ii, column in enumerate(table['header_name']): column = column.lower() - column_new = re.sub('(.*?)', '', column) - column_new = re.sub('(.*?)', '', column_new) + column_new = column cphrase, cscore = self.get_match_phrase( nlu.lower(), column_new) if cscore > 0.3 and cphrase.strip() != '': @@ -330,7 +335,6 @@ class SchemaLinker: for cell in ans.keys(): vphrase = cell vscore = 1.0 - # print("trie_set find:", cell, ans[cell]) phrase_tok = tokenizer.tokenize(vphrase) if len(phrase_tok) == 0 or len(vphrase) < 2: continue @@ -408,16 +412,25 @@ class SchemaLinker: match_score = self.get_table_match_score(nlu_t, schema_link) search_result = { - 'table_id': table['table_id'], - 'question_knowledge': final_question, - 'header_knowledge': final_header, - 'schema_link': schema_link, - 'match_score': match_score + 'table_id': + table['table_id'], + 'question_knowledge': + final_question, + 'header_knowledge': + final_header, + 'schema_link': + schema_link, + 'match_score': + match_score, + 'table_score': + int(table['table_id'] == history_sql['from'][0]) + if history_sql is not None else 0 } search_result_list.append(search_result) search_result_list = sorted( - search_result_list, key=lambda x: x['match_score'], + search_result_list, + key=lambda x: (x['match_score'], x['table_score']), reverse=True)[0:4] return search_result_list diff --git a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py index 163759a1..f98aa6d0 100644 --- a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py +++ b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py @@ -95,7 +95,7 @@ class TableQuestionAnsweringPreprocessor(Preprocessor): # tokenize question question = data['question'] - history_sql = data['history_sql'] + history_sql = data.get('history_sql', None) nlu = question.lower() nlu_t = self.tokenizer.tokenize(nlu) @@ -105,7 +105,8 @@ class TableQuestionAnsweringPreprocessor(Preprocessor): nlu=nlu, nlu_t=nlu_t, tables=self.db.tables, - col_syn_dict=self.db.syn_dict) + col_syn_dict=self.db.syn_dict, + history_sql=history_sql) # collect data datas = self.construct_data( diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py index eba12103..35b374f2 100644 --- a/modelscope/utils/nlp/nlp_utils.py +++ b/modelscope/utils/nlp/nlp_utils.py @@ -2,8 +2,7 @@ from typing import List from 
modelscope.outputs import OutputKeys from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline, - DialogStateTrackingPipeline, - TableQuestionAnsweringPipeline) + DialogStateTrackingPipeline) def text2sql_tracking_and_print_results( @@ -42,17 +41,3 @@ def tracking_and_print_dialog_states( print(json.dumps(result)) history_states.extend([result[OutputKeys.OUTPUT], {}]) - - -def tableqa_tracking_and_print_results( - test_case, pipelines: List[TableQuestionAnsweringPipeline]): - for pipeline in pipelines: - historical_queries = None - for question in test_case['utterance']: - output_dict = pipeline({ - 'question': question, - 'history_sql': historical_queries - }) - print('output_dict', output_dict['output'].string, - output_dict['output'].query) - historical_queries = output_dict['history'] diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 7ea28725..68e0564f 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os import unittest +from typing import List from transformers import BertTokenizer @@ -11,10 +12,60 @@ from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline from modelscope.preprocessors import TableQuestionAnsweringPreprocessor from modelscope.preprocessors.star3.fields.database import Database from modelscope.utils.constant import ModelFile, Tasks -from modelscope.utils.nlp.nlp_utils import tableqa_tracking_and_print_results from modelscope.utils.test_utils import test_level +def tableqa_tracking_and_print_results_with_history( + pipelines: List[TableQuestionAnsweringPipeline]): + test_case = { + 'utterance': [ + '有哪些风险类型?', + '风险类型有多少种?', + '珠江流域的小(2)型水库的库容总量是多少?', + '那平均值是多少?', + '那水库的名称呢?', + '换成中型的呢?', + '枣庄营业厅的电话', + '那地址呢?', + '枣庄营业厅的电话和地址', + ] + } + for p in pipelines: + historical_queries = None + for question in test_case['utterance']: + output_dict = p({ + 'question': question, + 'history_sql': historical_queries + }) + print('question', question) + print('sql text:', output_dict['output'].string) + print('sql query:', output_dict['output'].query) + print('query result:', output_dict['query_result']) + print() + historical_queries = output_dict['history'] + + +def tableqa_tracking_and_print_results_without_history( + pipelines: List[TableQuestionAnsweringPipeline]): + test_case = { + 'utterance': [ + '有哪些风险类型?', + '风险类型有多少种?', + '珠江流域的小(2)型水库的库容总量是多少?', + '枣庄营业厅的电话', + '枣庄营业厅的电话和地址', + ] + } + for p in pipelines: + for question in test_case['utterance']: + output_dict = p({'question': question}) + print('question', question) + print('sql text:', output_dict['output'].string) + print('sql query:', output_dict['output'].query) + print('query result:', output_dict['query_result']) + print() + + class TableQuestionAnswering(unittest.TestCase): def setUp(self) -> None: @@ -22,20 +73,18 @@ class TableQuestionAnswering(unittest.TestCase): self.model_id = 'damo/nlp_convai_text2sql_pretrain_cn' model_id = 'damo/nlp_convai_text2sql_pretrain_cn' - test_case = { - 'utterance': - ['长江流域的小(2)型水库的库容总量是多少?', '那平均值是多少?', '那水库的名称呢?', '换成中型的呢?'] - } @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) preprocessor = TableQuestionAnsweringPreprocessor(model_dir=cache_path) pipelines = [ - TableQuestionAnsweringPipeline( - model=cache_path, 
preprocessor=preprocessor) + pipeline( + Tasks.table_question_answering, + model=cache_path, + preprocessor=preprocessor) ] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): @@ -43,15 +92,17 @@ class TableQuestionAnswering(unittest.TestCase): preprocessor = TableQuestionAnsweringPreprocessor( model_dir=model.model_dir) pipelines = [ - TableQuestionAnsweringPipeline( - model=model, preprocessor=preprocessor) + pipeline( + Tasks.table_question_answering, + model=model, + preprocessor=preprocessor) ] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_task(self): pipelines = [pipeline(Tasks.table_question_answering, self.model_id)] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub_with_other_classes(self): @@ -60,15 +111,24 @@ class TableQuestionAnswering(unittest.TestCase): os.path.join(model.model_dir, ModelFile.VOCAB_FILE)) db = Database( tokenizer=self.tokenizer, - table_file_path=os.path.join(model.model_dir, 'table.json'), - syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt')) + table_file_path=[ + os.path.join(model.model_dir, 'databases', fname) + for fname in os.listdir( + os.path.join(model.model_dir, 'databases')) + ], + syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt'), + is_use_sqlite=True) preprocessor = TableQuestionAnsweringPreprocessor( model_dir=model.model_dir, db=db) pipelines = [ - TableQuestionAnsweringPipeline( - model=model, preprocessor=preprocessor, db=db) + pipeline( + Tasks.table_question_answering, + model=model, + preprocessor=preprocessor, + db=db) ] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_without_history(pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) if __name__ == '__main__': From 3edf30caa60af9bab70f8aea4217a79581bb473c Mon Sep 17 00:00:00 2001 From: ly261666 Date: Wed, 12 Oct 2022 15:19:12 +0800 Subject: [PATCH 19/57] [to #42322933]change the default model of face detection after discussion Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371469 --- modelscope/pipelines/builder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index b18d4465..1f563915 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -118,8 +118,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.hand_2d_keypoints: (Pipelines.hand_2d_keypoints, 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'), - Tasks.face_detection: (Pipelines.face_detection, - 'damo/cv_resnet_facedetection_scrfd10gkps'), + Tasks.face_detection: + (Pipelines.face_detection, + 'damo/cv_resnet101_face-detection_cvpr22papermogface'), Tasks.face_recognition: (Pipelines.face_recognition, 'damo/cv_ir101_facerecognition_cfglint'), Tasks.facial_expression_recognition: From a26e6e38697a8795b99de4c7929b415baef78268 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 17:33:03 +0800 Subject: [PATCH 20/57] [to #45071449] fix 
setup error Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10196007 --- modelscope/models/audio/tts/models/datasets/__init__.py | 0 requirements/framework.txt | 1 + 2 files changed, 1 insertion(+) mode change 100644 => 100755 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100644 new mode 100755 diff --git a/requirements/framework.txt b/requirements/framework.txt index b51faeda..aae200da 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -15,6 +15,7 @@ pyyaml requests scipy setuptools +setuptools_scm tensorboard tqdm>=4.64.0 yapf From 8c91a4972e2ced9f1b40613ee88f4c5197bafa6e Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 19:01:34 +0800 Subject: [PATCH 21/57] require pai-easycv 0.6.3.7 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10380097 --- requirements/cv.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/cv.txt b/requirements/cv.txt index e6ffb5ff..eb38beb1 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -17,7 +17,7 @@ mmdet>=2.25.0 networkx>=2.5 numba onnxruntime>=1.10 -pai-easycv>=0.6.3.6 +pai-easycv>=0.6.3.7 pandas psutil regex From 295fdd1a609def5e0c8b57783f1fca656e4cbcb0 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Wed, 12 Oct 2022 19:01:57 +0800 Subject: [PATCH 22/57] [to #45443331]fix: git config email with username bug Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10378571 --- modelscope/hub/git.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index 486f8df3..a149ede1 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -138,8 +138,8 @@ class GitCommandWrapper(metaclass=Singleton): repo_base_dir, repo_name, user_name) response = self._run_git_command(*config_user_name_args.split(' ')) logger.debug(response.stdout.decode('utf8')) - config_user_email_args = '-C %s/%s config user.name %s' % ( - repo_base_dir, repo_name, user_name) + config_user_email_args = '-C %s/%s config user.email %s' % ( + repo_base_dir, repo_name, user_email) response = self._run_git_command( *config_user_email_args.split(' ')) logger.debug(response.stdout.decode('utf8')) From 4cb5f8a2cd104f89b765d56527d448b2df1be151 Mon Sep 17 00:00:00 2001 From: "shouzhou.bx" Date: Wed, 12 Oct 2022 19:53:14 +0800 Subject: [PATCH 23/57] [to #42322933] add human whole body model and image object detection auto model Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10319306 --- data/test/images/auto_demo.jpg | 3 + .../body_keypoints_detection.jpg | 3 - .../keypoints_detect/img_test_wholebody.jpg | 3 + modelscope/metainfo.py | 5 ++ modelscope/models/cv/__init__.py | 20 +++--- .../cv/human_wholebody_keypoint/__init__.py | 22 +++++++ .../human_wholebody_keypoint.py | 17 +++++ .../models/cv/object_detection/__init__.py | 2 +- .../models/cv/object_detection/yolox_pai.py | 3 + .../cv/human_wholebody_keypoint/__init__.py | 22 +++++++ .../human_wholebody_keypoint_dataset.py | 39 +++++++++++ modelscope/outputs.py | 19 +++++- modelscope/pipelines/builder.py | 8 ++- modelscope/pipelines/cv/__init__.py | 11 +++- .../cv/body_2d_keypoints_pipeline.py | 4 +- .../cv/body_3d_keypoints_pipeline.py | 2 +- .../pipelines/cv/easycv_pipelines/__init__.py | 5 +- .../cv/easycv_pipelines/detection_pipeline.py | 41 +++++++++++- .../human_wholebody_keypoint_pipeline.py 
| 65 +++++++++++++++++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 34 +++++++++- .../test_human_wholebody_keypoint.py | 40 ++++++++++++ tests/pipelines/test_object_detection.py | 12 ++++ 23 files changed, 353 insertions(+), 28 deletions(-) create mode 100644 data/test/images/auto_demo.jpg delete mode 100644 data/test/images/keypoints_detect/body_keypoints_detection.jpg create mode 100644 data/test/images/keypoints_detect/img_test_wholebody.jpg create mode 100644 modelscope/models/cv/human_wholebody_keypoint/__init__.py create mode 100644 modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py create mode 100644 modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py create mode 100644 modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py create mode 100644 modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py create mode 100644 tests/pipelines/test_human_wholebody_keypoint.py diff --git a/data/test/images/auto_demo.jpg b/data/test/images/auto_demo.jpg new file mode 100644 index 00000000..30393e53 --- /dev/null +++ b/data/test/images/auto_demo.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76bf84536edbaf192a8a699efc62ba2b06056bac12c426ecfcc2e003d91fbd32 +size 53219 diff --git a/data/test/images/keypoints_detect/body_keypoints_detection.jpg b/data/test/images/keypoints_detect/body_keypoints_detection.jpg deleted file mode 100644 index 71ce7d7e..00000000 --- a/data/test/images/keypoints_detect/body_keypoints_detection.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:379e11d7fc3734d3ec95afd0d86460b4653fbf4bb1f57f993610d6a6fd30fd3d -size 1702339 diff --git a/data/test/images/keypoints_detect/img_test_wholebody.jpg b/data/test/images/keypoints_detect/img_test_wholebody.jpg new file mode 100644 index 00000000..40a9f3f8 --- /dev/null +++ b/data/test/images/keypoints_detect/img_test_wholebody.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dec0fbb931cb609bf481e56b89cd2fbbab79839f22832c3bbe69a8fae2769cdd +size 167407 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index cae9d188..759f1688 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -40,6 +40,7 @@ class Models(object): mtcnn = 'mtcnn' ulfd = 'ulfd' video_inpainting = 'video-inpainting' + human_wholebody_keypoint = 'human-wholebody-keypoint' hand_static = 'hand-static' face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' @@ -49,6 +50,7 @@ class Models(object): # EasyCV models yolox = 'YOLOX' segformer = 'Segformer' + image_object_detection_auto = 'image-object-detection-auto' # nlp models bert = 'bert' @@ -170,6 +172,7 @@ class Pipelines(object): ocr_recognition = 'convnextTiny-ocr-recognition' image_portrait_enhancement = 'gpen-image-portrait-enhancement' image_to_image_generation = 'image-to-image-generation' + image_object_detection_auto = 'yolox_image-object-detection-auto' skin_retouching = 'unet-skin-retouching' tinynas_classification = 'tinynas-classification' tinynas_detection = 'tinynas-detection' @@ -185,6 +188,7 @@ class Pipelines(object): movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' shop_segmentation = 'shop-segmentation' video_inpainting = 'video-inpainting' + human_wholebody_keypoint = 'hrnetw48_human-wholebody-keypoint_image' pst_action_recognition = 'patchshift-action-recognition' hand_static = 'hand-static' face_human_hand_detection = 
'face-human-hand-detection' @@ -427,6 +431,7 @@ class Datasets(object): """ ClsDataset = 'ClsDataset' Face2dKeypointsDataset = 'Face2dKeypointsDataset' + HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset' SegDataset = 'SegDataset' DetDataset = 'DetDataset' DetImagesMixDataset = 'DetImagesMixDataset' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index ba7b03c5..fd950f4c 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -4,15 +4,15 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, crowd_counting, face_2d_keypoints, face_detection, - face_generation, image_classification, image_color_enhance, - image_colorization, image_denoise, image_inpainting, - image_instance_segmentation, image_panoptic_segmentation, - image_portrait_enhancement, image_reid_person, - image_semantic_segmentation, image_to_image_generation, - image_to_image_translation, movie_scene_segmentation, - object_detection, product_retrieval_embedding, - realtime_object_detection, salient_detection, shop_segmentation, - super_resolution, video_single_object_tracking, - video_summarization, virual_tryon) + face_generation, human_wholebody_keypoint, image_classification, + image_color_enhance, image_colorization, image_denoise, + image_inpainting, image_instance_segmentation, + image_panoptic_segmentation, image_portrait_enhancement, + image_reid_person, image_semantic_segmentation, + image_to_image_generation, image_to_image_translation, + movie_scene_segmentation, object_detection, + product_retrieval_embedding, realtime_object_detection, + salient_detection, shop_segmentation, super_resolution, + video_single_object_tracking, video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/human_wholebody_keypoint/__init__.py b/modelscope/models/cv/human_wholebody_keypoint/__init__.py new file mode 100644 index 00000000..30e23457 --- /dev/null +++ b/modelscope/models/cv/human_wholebody_keypoint/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .human_wholebody_keypoint import HumanWholeBodyKeypoint + +else: + _import_structure = { + 'human_wholebody_keypoint': ['HumanWholeBodyKeypoint'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py b/modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py new file mode 100644 index 00000000..dd3c0290 --- /dev/null +++ b/modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py @@ -0,0 +1,17 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from easycv.models.pose.top_down import TopDown + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.human_wholebody_keypoint, + module_name=Models.human_wholebody_keypoint) +class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + TopDown.__init__(self, *args, **kwargs) diff --git a/modelscope/models/cv/object_detection/__init__.py b/modelscope/models/cv/object_detection/__init__.py index 974375ce..0c782d7b 100644 --- a/modelscope/models/cv/object_detection/__init__.py +++ b/modelscope/models/cv/object_detection/__init__.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: else: _import_structure = { 'mmdet_model': ['DetectionModel'], - 'yolox_pai': ['YOLOX'] + 'yolox_pai': ['YOLOX'], } import sys diff --git a/modelscope/models/cv/object_detection/yolox_pai.py b/modelscope/models/cv/object_detection/yolox_pai.py index 985cc136..46bd4e3c 100644 --- a/modelscope/models/cv/object_detection/yolox_pai.py +++ b/modelscope/models/cv/object_detection/yolox_pai.py @@ -9,6 +9,9 @@ from modelscope.utils.constant import Tasks @MODELS.register_module( group_key=Tasks.image_object_detection, module_name=Models.yolox) +@MODELS.register_module( + group_key=Tasks.image_object_detection, + module_name=Models.image_object_detection_auto) class YOLOX(EasyCVBaseModel, _YOLOX): def __init__(self, model_dir=None, *args, **kwargs): diff --git a/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py b/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py new file mode 100644 index 00000000..472ed2d8 --- /dev/null +++ b/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .human_wholebody_keypoint_dataset import WholeBodyCocoTopDownDataset + +else: + _import_structure = { + 'human_wholebody_keypoint_dataset': ['WholeBodyCocoTopDownDataset'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py b/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py new file mode 100644 index 00000000..fc9469f2 --- /dev/null +++ b/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py @@ -0,0 +1,39 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.datasets.pose import \ + WholeBodyCocoTopDownDataset as _WholeBodyCocoTopDownDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.human_wholebody_keypoint, + module_name=Datasets.HumanWholeBodyKeypointDataset) +class WholeBodyCocoTopDownDataset(EasyCVBaseDataset, + _WholeBodyCocoTopDownDataset): + """EasyCV dataset for human whole body 2d keypoints. + + Args: + split_config (dict): Dataset root path from MSDataset, e.g. 
+ {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. + """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _WholeBodyCocoTopDownDataset.__init__(self, *args, **kwargs) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 0f353d3d..ab3ea54a 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -203,7 +203,7 @@ TASK_OUTPUTS = { # human body keypoints detection result for single sample # { - # "poses": [ + # "keypoints": [ # [[x, y]*15], # [[x, y]*15], # [[x, y]*15] @@ -220,7 +220,7 @@ TASK_OUTPUTS = { # ] # } Tasks.body_2d_keypoints: - [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES], + [OutputKeys.KEYPOINTS, OutputKeys.SCORES, OutputKeys.BOXES], # 3D human body keypoints detection result for single sample # { @@ -339,6 +339,21 @@ TASK_OUTPUTS = { OutputKeys.SCENE_META_LIST ], + # human whole body keypoints detection result for single sample + # { + # "keypoints": [ + # [[x, y]*133], + # [[x, y]*133], + # [[x, y]*133] + # ] + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # } + Tasks.human_wholebody_keypoint: [OutputKeys.KEYPOINTS, OutputKeys.BOXES], + # video summarization result for a single video # { # "output": diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 1f563915..bc9073bc 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -75,8 +75,6 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/nlp_bart_text-error-correction_chinese'), Tasks.image_captioning: (Pipelines.image_captioning, 'damo/ofa_image-caption_coco_large_en'), - Tasks.image_body_reshaping: (Pipelines.image_body_reshaping, - 'damo/cv_flow-based-body-reshaping_damo'), Tasks.image_portrait_stylization: (Pipelines.person_image_cartoon, 'damo/cv_unet_person-image-cartoon_compound-models'), @@ -159,6 +157,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.image_classification: (Pipelines.daily_image_classification, 'damo/cv_vit-base_image-classification_Dailylife-labels'), + Tasks.image_object_detection: + (Pipelines.image_object_detection_auto, + 'damo/cv_yolox_image-object-detection-auto'), Tasks.ocr_recognition: (Pipelines.ocr_recognition, 'damo/cv_convnextTiny_ocr-recognition-general_damo'), @@ -186,6 +187,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_fft_inpainting_lama'), Tasks.video_inpainting: (Pipelines.video_inpainting, 'damo/cv_video-inpainting'), + Tasks.human_wholebody_keypoint: + (Pipelines.human_wholebody_keypoint, + 'damo/cv_hrnetw48_human-wholebody-keypoint_image'), Tasks.hand_static: (Pipelines.hand_static, 'damo/cv_mobileface_hand-static'), Tasks.face_human_hand_detection: diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 118eaf17..f84f5fe5 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -46,7 +46,10 @@ if TYPE_CHECKING: from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline from .shop_segmentation_pipleline import ShopSegmentationPipeline - from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline + from 
.easycv_pipelines import (EasyCVDetectionPipeline, + EasyCVSegmentationPipeline, + Face2DKeypointsPipeline, + HumanWholebodyKeypointsPipeline) from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline from .mog_face_detection_pipeline import MogFaceDetectionPipeline @@ -109,8 +112,10 @@ else: 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], 'shop_segmentation_pipleline': ['ShopSegmentationPipeline'], 'easycv_pipeline': [ - 'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline', - 'Face2DKeypointsPipeline' + 'EasyCVDetectionPipeline', + 'EasyCVSegmentationPipeline', + 'Face2DKeypointsPipeline', + 'HumanWholebodyKeypointsPipeline', ], 'text_driven_segmentation_pipeline': ['TextDrivenSegmentationPipeline'], diff --git a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py index d6afbae4..bc2e975d 100644 --- a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py @@ -73,7 +73,7 @@ class Body2DKeypointsPipeline(Pipeline): if input[0] is None or input[1] is None: return { OutputKeys.BOXES: [], - OutputKeys.POSES: [], + OutputKeys.KEYPOINTS: [], OutputKeys.SCORES: [] } @@ -83,7 +83,7 @@ class Body2DKeypointsPipeline(Pipeline): result_boxes.append([box[0][0], box[0][1], box[1][0], box[1][1]]) return { OutputKeys.BOXES: result_boxes, - OutputKeys.POSES: poses, + OutputKeys.KEYPOINTS: poses, OutputKeys.SCORES: scores } diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index c3f4e8c1..3502915c 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -145,7 +145,7 @@ class Body3DKeypointsPipeline(Pipeline): kps_2d = self.human_body_2d_kps_detector(frame) box = kps_2d['boxes'][ 0] # box: [[[x1, y1], [x2, y2]]], N human boxes per frame, [0] represent using first detected bbox - pose = kps_2d['poses'][0] # keypoints: [15, 2] + pose = kps_2d['keypoints'][0] # keypoints: [15, 2] score = kps_2d['scores'][0] # keypoints: [15, 2] all_2d_poses.append(pose) all_boxes_with_socre.append( diff --git a/modelscope/pipelines/cv/easycv_pipelines/__init__.py b/modelscope/pipelines/cv/easycv_pipelines/__init__.py index 4f149130..e0209b85 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/__init__.py +++ b/modelscope/pipelines/cv/easycv_pipelines/__init__.py @@ -7,11 +7,14 @@ if TYPE_CHECKING: from .detection_pipeline import EasyCVDetectionPipeline from .segmentation_pipeline import EasyCVSegmentationPipeline from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline + from .human_wholebody_keypoint_pipeline import HumanWholebodyKeypointsPipeline else: _import_structure = { 'detection_pipeline': ['EasyCVDetectionPipeline'], 'segmentation_pipeline': ['EasyCVSegmentationPipeline'], - 'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline'] + 'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline'], + 'human_wholebody_keypoint_pipeline': + ['HumanWholebodyKeypointsPipeline'], } import sys diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py index 32365102..0c2058d5 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py @@ -1,16 +1,28 @@ # Copyright (c) Alibaba, Inc. 
and its affiliates. +from typing import Any + from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.constant import Tasks +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.cv.image_utils import \ + show_image_object_detection_auto_result from .base import EasyCVPipeline @PIPELINES.register_module( Tasks.image_object_detection, module_name=Pipelines.easycv_detection) +@PIPELINES.register_module( + Tasks.image_object_detection, + module_name=Pipelines.image_object_detection_auto) class EasyCVDetectionPipeline(EasyCVPipeline): """Pipeline for easycv detection task.""" - def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs): + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): """ model (str): model id on modelscope hub or local model path. model_file_pattern (str): model file pattern. @@ -21,3 +33,28 @@ class EasyCVDetectionPipeline(EasyCVPipeline): model_file_pattern=model_file_pattern, *args, **kwargs) + + def show_result(self, img_path, result, save_path=None): + show_image_object_detection_auto_result(img_path, result, save_path) + + def __call__(self, inputs) -> Any: + outputs = self.predict_op(inputs) + + scores = [] + labels = [] + boxes = [] + for output in outputs: + for score, label, box in zip(output['detection_scores'], + output['detection_classes'], + output['detection_boxes']): + scores.append(score) + labels.append(self.cfg.CLASSES[label]) + boxes.append([b for b in box]) + + results = [{ + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + OutputKeys.BOXES: boxes + } for output in outputs] + + return results diff --git a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py new file mode 100644 index 00000000..263f8225 --- /dev/null +++ b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py @@ -0,0 +1,65 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path +from typing import Any + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import ModelFile, Tasks +from .base import EasyCVPipeline + + +@PIPELINES.register_module( + Tasks.human_wholebody_keypoint, + module_name=Pipelines.human_wholebody_keypoint) +class HumanWholebodyKeypointsPipeline(EasyCVPipeline): + """Pipeline for human wholebody 2d keypoints detection.""" + + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. 
+ """ + self.model_dir = model + super(HumanWholebodyKeypointsPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) + + def _build_predict_op(self, **kwargs): + """Build EasyCV predictor.""" + from easycv.predictors.builder import build_predictor + detection_predictor_type = self.cfg['DETECTION']['type'] + detection_model_path = os.path.join( + self.model_dir, self.cfg['DETECTION']['model_path']) + detection_cfg_file = os.path.join(self.model_dir, + self.cfg['DETECTION']['config_file']) + detection_score_threshold = self.cfg['DETECTION']['score_threshold'] + self.cfg.pipeline.predictor_config[ + 'detection_predictor_config'] = dict( + type=detection_predictor_type, + model_path=detection_model_path, + config_file=detection_cfg_file, + score_threshold=detection_score_threshold) + easycv_config = self._to_easycv_config() + pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, { + 'model_path': self.model_path, + 'config_file': easycv_config, + **kwargs + }) + return pipeline_op + + def __call__(self, inputs) -> Any: + outputs = self.predict_op(inputs) + + results = [{ + OutputKeys.KEYPOINTS: output['keypoints'], + OutputKeys.BOXES: output['boxes'] + } for output in outputs] + + return results diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2a5ac694..4fa3d766 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -29,6 +29,7 @@ class CVTasks(object): body_3d_keypoints = 'body-3d-keypoints' hand_2d_keypoints = 'hand-2d-keypoints' general_recognition = 'general-recognition' + human_wholebody_keypoint = 'human-wholebody-keypoint' image_classification = 'image-classification' image_multilabel_classification = 'image-multilabel-classification' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index eab74688..06a9bbaa 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -80,7 +80,7 @@ def realtime_object_detection_bbox_vis(image, bboxes): def draw_keypoints(output, original_image): - poses = np.array(output[OutputKeys.POSES]) + poses = np.array(output[OutputKeys.KEYPOINTS]) scores = np.array(output[OutputKeys.SCORES]) boxes = np.array(output[OutputKeys.BOXES]) assert len(poses) == len(scores) and len(poses) == len(boxes) @@ -234,3 +234,35 @@ def show_video_summarization_result(video_in_path, result, video_save_path): video_writer.write(frame) video_writer.release() cap.release() + + +def show_image_object_detection_auto_result(img_path, + detection_result, + save_path=None): + scores = detection_result[OutputKeys.SCORES] + labels = detection_result[OutputKeys.LABELS] + bboxes = detection_result[OutputKeys.BOXES] + img = cv2.imread(img_path) + assert img is not None, f"Can't read img: {img_path}" + + for (score, label, box) in zip(scores, labels, bboxes): + cv2.rectangle(img, (int(box[0]), int(box[1])), + (int(box[2]), int(box[3])), (0, 0, 255), 2) + cv2.putText( + img, + f'{score:.2f}', (int(box[0]), int(box[1])), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + cv2.putText( + img, + label, (int((box[0] + box[2]) * 0.5), int(box[1])), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + + if save_path is not None: + cv2.imwrite(save_path, img) + return img diff --git a/tests/pipelines/test_human_wholebody_keypoint.py b/tests/pipelines/test_human_wholebody_keypoint.py new file mode 100644 index 00000000..b214f4e1 --- /dev/null +++ b/tests/pipelines/test_human_wholebody_keypoint.py @@ -0,0 +1,40 @@ +# 
Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import cv2 + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_human_wholebody_keypoint(self): + img_path = 'data/test/images/keypoints_detect/img_test_wholebody.jpg' + model_id = 'damo/cv_hrnetw48_human-wholebody-keypoint_image' + + human_wholebody_keypoint_pipeline = pipeline( + task=Tasks.human_wholebody_keypoint, model=model_id) + output = human_wholebody_keypoint_pipeline(img_path)[0] + + output_keypoints = output[OutputKeys.KEYPOINTS] + output_pose = output[OutputKeys.BOXES] + + human_wholebody_keypoint_pipeline.predict_op.show_result( + img_path, + output_keypoints, + output_pose, + scale=1, + save_path='human_wholebody_keypoint_ret.jpg') + + for keypoint in output_keypoints: + self.assertEqual(keypoint.shape[0], 133) + for box in output_pose: + self.assertEqual(box.shape[0], 4) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index 2a74eb41..2cb217d9 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -59,6 +59,18 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def test_demo_compatibility(self): self.compatibility_check() + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_image_object_detection_auto_pipeline(self): + model_id = 'damo/cv_yolox_image-object-detection-auto' + test_image = 'data/test/images/auto_demo.jpg' + + image_object_detection_auto = pipeline( + Tasks.image_object_detection, model=model_id) + + result = image_object_detection_auto(test_image)[0] + image_object_detection_auto.show_result(test_image, result, + 'auto_demo_ret.jpg') + if __name__ == '__main__': unittest.main() From 2989492bc08245ff02a71ac988b175d9e038d807 Mon Sep 17 00:00:00 2001 From: "yuxiang.tyx" Date: Wed, 12 Oct 2022 19:58:50 +0800 Subject: [PATCH 24/57] =?UTF-8?q?[to=20#42322933]=E6=9B=B4=E6=96=B0face=5F?= =?UTF-8?q?detection=5Fscrfd=E6=A8=A1=E5=9E=8B=E5=B9=B6=E6=94=AF=E6=8C=81f?= =?UTF-8?q?inetune,=20=E6=96=B0=E5=A2=9Ecard=5Fdetection=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 调整face_detection的文件层级(scrfd与其余新增face_detection方法平级); 2. 增加极大脸/旋转脸的检测方法,更新了新模型; 3. 支持读入数据集并finetune和eval; 4. 
新增card_detection模型,支持读入datasethub数据集并finetune Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10244540 --- data/test/images/card_detection.jpg | 3 + data/test/images/face_detection2.jpeg | 3 + modelscope/metainfo.py | 3 + .../models/cv/face_detection/__init__.py | 4 +- .../datasets/pipelines/transforms.py | 189 ----- .../cv/face_detection/scrfd/__init__.py | 2 + .../{ => scrfd}/mmdet_patch/__init__.py | 0 .../{ => scrfd}/mmdet_patch/core/__init__.py | 0 .../mmdet_patch/core/bbox/__init__.py | 0 .../mmdet_patch/core/bbox/transforms.py | 4 +- .../core/post_processing/__init__.py | 0 .../core/post_processing/bbox_nms.py | 9 +- .../mmdet_patch/datasets/__init__.py | 0 .../datasets/pipelines/__init__.py | 8 +- .../datasets/pipelines/auto_augment.py | 271 +++++++ .../datasets/pipelines/formating.py | 113 +++ .../mmdet_patch/datasets/pipelines/loading.py | 225 ++++++ .../datasets/pipelines/transforms.py | 737 ++++++++++++++++++ .../mmdet_patch/datasets/retinaface.py | 5 +- .../mmdet_patch/models/__init__.py | 0 .../mmdet_patch/models/backbones/__init__.py | 0 .../mmdet_patch/models/backbones/resnet.py | 0 .../models/dense_heads/__init__.py | 0 .../models/dense_heads/scrfd_head.py | 11 +- .../mmdet_patch/models/detectors/__init__.py | 0 .../mmdet_patch/models/detectors/scrfd.py | 108 ++- .../cv/face_detection/scrfd/scrfd_detect.py | 71 ++ modelscope/outputs.py | 19 + modelscope/pipelines/builder.py | 4 + .../pipelines/cv/card_detection_pipeline.py | 23 + .../pipelines/cv/face_detection_pipeline.py | 39 +- .../pipelines/cv/face_recognition_pipeline.py | 2 +- .../cv/card_detection_scrfd_trainer.py | 18 + .../cv/face_detection_scrfd_trainer.py | 154 ++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 48 ++ tests/pipelines/test_card_detection.py | 66 ++ tests/pipelines/test_face_detection.py | 12 +- .../test_card_detection_scrfd_trainer.py | 151 ++++ .../test_face_detection_scrfd_trainer.py | 150 ++++ 40 files changed, 2174 insertions(+), 279 deletions(-) create mode 100644 data/test/images/card_detection.jpg create mode 100644 data/test/images/face_detection2.jpeg delete mode 100755 modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py create mode 100644 modelscope/models/cv/face_detection/scrfd/__init__.py rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/bbox/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/bbox/transforms.py (94%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/post_processing/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/post_processing/bbox_nms.py (89%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/datasets/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/datasets/pipelines/__init__.py (53%) create mode 100644 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py create mode 100644 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py create mode 100644 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py create mode 100755 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py rename modelscope/models/cv/face_detection/{ 
=> scrfd}/mmdet_patch/datasets/retinaface.py (97%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/backbones/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/backbones/resnet.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/dense_heads/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/dense_heads/scrfd_head.py (99%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/detectors/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/detectors/scrfd.py (50%) create mode 100644 modelscope/models/cv/face_detection/scrfd/scrfd_detect.py create mode 100644 modelscope/pipelines/cv/card_detection_pipeline.py create mode 100644 modelscope/trainers/cv/card_detection_scrfd_trainer.py create mode 100644 modelscope/trainers/cv/face_detection_scrfd_trainer.py create mode 100644 tests/pipelines/test_card_detection.py create mode 100644 tests/trainers/test_card_detection_scrfd_trainer.py create mode 100644 tests/trainers/test_face_detection_scrfd_trainer.py diff --git a/data/test/images/card_detection.jpg b/data/test/images/card_detection.jpg new file mode 100644 index 00000000..86728c2c --- /dev/null +++ b/data/test/images/card_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecbc9d0827cfb92e93e7d75868b1724142685dc20d3b32023c3c657a7b688a9c +size 254845 diff --git a/data/test/images/face_detection2.jpeg b/data/test/images/face_detection2.jpeg new file mode 100644 index 00000000..7f6025fa --- /dev/null +++ b/data/test/images/face_detection2.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d510ab26ddc58ffea882c8ef850c1f9bd4444772f2bce7ebea3e76944536c3ae +size 48909 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 759f1688..0917bf3e 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -148,6 +148,7 @@ class Pipelines(object): salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' + card_detection = 'resnet-card-detection-scrfd34gkps' ulfd_face_detection = 'manual-face-detection-ulfd' facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' retina_face_detection = 'resnet50-face-detection-retinaface' @@ -270,6 +271,8 @@ class Trainers(object): image_portrait_enhancement = 'image-portrait-enhancement' video_summarization = 'video-summarization' movie_scene_segmentation = 'movie-scene-segmentation' + face_detection_scrfd = 'face-detection-scrfd' + card_detection_scrfd = 'card-detection-scrfd' image_inpainting = 'image-inpainting' # nlp trainers diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py index a2a845d2..27d1bd4c 100644 --- a/modelscope/models/cv/face_detection/__init__.py +++ b/modelscope/models/cv/face_detection/__init__.py @@ -8,12 +8,14 @@ if TYPE_CHECKING: from .mtcnn import MtcnnFaceDetector from .retinaface import RetinaFaceDetection from .ulfd_slim import UlfdFaceDetector + from .scrfd import ScrfdDetect else: _import_structure = { 'ulfd_slim': ['UlfdFaceDetector'], 'retinaface': ['RetinaFaceDetection'], 'mtcnn': ['MtcnnFaceDetector'], - 'mogface': ['MogFaceDetector'] + 'mogface': ['MogFaceDetector'], + 'scrfd': ['ScrfdDetect'] } import sys 
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py deleted file mode 100755 index 241f2c0e..00000000 --- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at -https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py -""" -import numpy as np -from mmdet.datasets.builder import PIPELINES -from numpy import random - - -@PIPELINES.register_module() -class RandomSquareCrop(object): - """Random crop the image & bboxes, the cropped patches have minimum IoU - requirement with original image & bboxes, the IoU threshold is randomly - selected from min_ious. - - Args: - min_ious (tuple): minimum IoU threshold for all intersections with - bounding boxes - min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, - where a >= min_crop_size). - - Note: - The keys for bboxes, labels and masks should be paired. That is, \ - `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ - `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. - """ - - def __init__(self, - crop_ratio_range=None, - crop_choice=None, - bbox_clip_border=True): - - self.crop_ratio_range = crop_ratio_range - self.crop_choice = crop_choice - self.bbox_clip_border = bbox_clip_border - - assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) - if self.crop_ratio_range is not None: - self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range - - self.bbox2label = { - 'gt_bboxes': 'gt_labels', - 'gt_bboxes_ignore': 'gt_labels_ignore' - } - self.bbox2mask = { - 'gt_bboxes': 'gt_masks', - 'gt_bboxes_ignore': 'gt_masks_ignore' - } - - def __call__(self, results): - """Call function to crop images and bounding boxes with minimum IoU - constraint. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Result dict with images and bounding boxes cropped, \ - 'img_shape' key is updated. 
- """ - - if 'img_fields' in results: - assert results['img_fields'] == ['img'], \ - 'Only single img_fields is allowed' - img = results['img'] - assert 'bbox_fields' in results - assert 'gt_bboxes' in results - boxes = results['gt_bboxes'] - h, w, c = img.shape - scale_retry = 0 - if self.crop_ratio_range is not None: - max_scale = self.crop_ratio_max - else: - max_scale = np.amax(self.crop_choice) - while True: - scale_retry += 1 - - if scale_retry == 1 or max_scale > 1.0: - if self.crop_ratio_range is not None: - scale = np.random.uniform(self.crop_ratio_min, - self.crop_ratio_max) - elif self.crop_choice is not None: - scale = np.random.choice(self.crop_choice) - else: - scale = scale * 1.2 - - for i in range(250): - short_side = min(w, h) - cw = int(scale * short_side) - ch = cw - - # TODO +1 - if w == cw: - left = 0 - elif w > cw: - left = random.randint(0, w - cw) - else: - left = random.randint(w - cw, 0) - if h == ch: - top = 0 - elif h > ch: - top = random.randint(0, h - ch) - else: - top = random.randint(h - ch, 0) - - patch = np.array( - (int(left), int(top), int(left + cw), int(top + ch)), - dtype=np.int) - - # center of boxes should inside the crop img - # only adjust boxes and instance masks when the gt is not empty - # adjust boxes - def is_center_of_bboxes_in_patch(boxes, patch): - # TODO >= - center = (boxes[:, :2] + boxes[:, 2:]) / 2 - mask = \ - ((center[:, 0] > patch[0]) - * (center[:, 1] > patch[1]) - * (center[:, 0] < patch[2]) - * (center[:, 1] < patch[3])) - return mask - - mask = is_center_of_bboxes_in_patch(boxes, patch) - if not mask.any(): - continue - for key in results.get('bbox_fields', []): - boxes = results[key].copy() - mask = is_center_of_bboxes_in_patch(boxes, patch) - boxes = boxes[mask] - if self.bbox_clip_border: - boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) - boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) - boxes -= np.tile(patch[:2], 2) - - results[key] = boxes - # labels - label_key = self.bbox2label.get(key) - if label_key in results: - results[label_key] = results[label_key][mask] - - # keypoints field - if key == 'gt_bboxes': - for kps_key in results.get('keypoints_fields', []): - keypointss = results[kps_key].copy() - keypointss = keypointss[mask, :, :] - if self.bbox_clip_border: - keypointss[:, :, : - 2] = keypointss[:, :, :2].clip( - max=patch[2:]) - keypointss[:, :, : - 2] = keypointss[:, :, :2].clip( - min=patch[:2]) - keypointss[:, :, 0] -= patch[0] - keypointss[:, :, 1] -= patch[1] - results[kps_key] = keypointss - - # mask fields - mask_key = self.bbox2mask.get(key) - if mask_key in results: - results[mask_key] = results[mask_key][mask.nonzero() - [0]].crop(patch) - - # adjust the img no matter whether the gt is empty before crop - rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 - patch_from = patch.copy() - patch_from[0] = max(0, patch_from[0]) - patch_from[1] = max(0, patch_from[1]) - patch_from[2] = min(img.shape[1], patch_from[2]) - patch_from[3] = min(img.shape[0], patch_from[3]) - patch_to = patch.copy() - patch_to[0] = max(0, patch_to[0] * -1) - patch_to[1] = max(0, patch_to[1] * -1) - patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) - patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) - rimg[patch_to[1]:patch_to[3], - patch_to[0]:patch_to[2], :] = img[ - patch_from[1]:patch_from[3], - patch_from[0]:patch_from[2], :] - img = rimg - results['img'] = img - results['img_shape'] = img.shape - - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += 
f'(min_ious={self.min_iou}, ' - repr_str += f'crop_size={self.crop_size})' - return repr_str diff --git a/modelscope/models/cv/face_detection/scrfd/__init__.py b/modelscope/models/cv/face_detection/scrfd/__init__.py new file mode 100644 index 00000000..92f81f7a --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .scrfd_detect import ScrfdDetect diff --git a/modelscope/models/cv/face_detection/mmdet_patch/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/core/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/transforms.py similarity index 94% rename from modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/transforms.py index d65480eb..75e32d85 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/transforms.py @@ -6,7 +6,7 @@ import numpy as np import torch -def bbox2result(bboxes, labels, num_classes, kps=None): +def bbox2result(bboxes, labels, num_classes, kps=None, num_kps=5): """Convert detection results to a list of numpy arrays. 
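To make the new num_kps argument above concrete: when keypoints are attached, each per-class result row carries the box, its score, and num_kps (x, y) pairs, as the bbox_len computation in the hunk below shows. The snippet is an illustrative check, not part of the patch; the 4-keypoint case is presumably the card-corner configuration:

import numpy as np

def row_length(num_kps, with_kps=True):
    # x1, y1, x2, y2, score, plus num_kps * (x, y) when keypoints are present
    return 5 + num_kps * 2 if with_kps else 5

assert row_length(5) == 15   # 5 facial landmarks
assert row_length(4) == 13   # 4 keypoints, e.g. card corners
empty = np.zeros((0, row_length(4)), dtype=np.float32)  # empty per-class result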
Args: @@ -17,7 +17,7 @@ def bbox2result(bboxes, labels, num_classes, kps=None): Returns: list(ndarray): bbox results of each class """ - bbox_len = 5 if kps is None else 5 + 10 # if has kps, add 10 kps into bbox + bbox_len = 5 if kps is None else 5 + num_kps * 2 # if has kps, add num_kps*2 into bbox if bboxes.shape[0] == 0: return [ np.zeros((0, bbox_len), dtype=np.float32) diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/bbox_nms.py similarity index 89% rename from modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/bbox_nms.py index 7a4f5b3a..697b7338 100644 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/bbox_nms.py @@ -17,6 +17,7 @@ def multiclass_nms(multi_bboxes, Args: multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_kps (Tensor): shape (n, #class*num_kps*2) or (n, num_kps*2) multi_scores (Tensor): shape (n, #class), where the last column contains scores of the background class, but this will be ignored. score_thr (float): bbox threshold, bboxes with scores lower than it @@ -36,16 +37,18 @@ def multiclass_nms(multi_bboxes, num_classes = multi_scores.size(1) - 1 # exclude background category kps = None + if multi_kps is not None: + num_kps = int((multi_kps.shape[1] / num_classes) / 2) if multi_bboxes.shape[1] > 4: bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) if multi_kps is not None: - kps = multi_kps.view(multi_scores.size(0), -1, 10) + kps = multi_kps.view(multi_scores.size(0), -1, num_kps * 2) else: bboxes = multi_bboxes[:, None].expand( multi_scores.size(0), num_classes, 4) if multi_kps is not None: kps = multi_kps[:, None].expand( - multi_scores.size(0), num_classes, 10) + multi_scores.size(0), num_classes, num_kps * 2) scores = multi_scores[:, :-1] if score_factors is not None: @@ -56,7 +59,7 @@ def multiclass_nms(multi_bboxes, bboxes = bboxes.reshape(-1, 4) if kps is not None: - kps = kps.reshape(-1, 10) + kps = kps.reshape(-1, num_kps * 2) scores = scores.reshape(-1) labels = labels.reshape(-1) diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/__init__.py similarity index 53% rename from modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/__init__.py index 85288910..a2cafd1a 100755 --- 
a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/__init__.py @@ -2,6 +2,12 @@ The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines """ +from .auto_augment import RotateV2 +from .formating import DefaultFormatBundleV2 +from .loading import LoadAnnotationsV2 from .transforms import RandomSquareCrop -__all__ = ['RandomSquareCrop'] +__all__ = [ + 'RandomSquareCrop', 'LoadAnnotationsV2', 'RotateV2', + 'DefaultFormatBundleV2' +] diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py new file mode 100644 index 00000000..ee60c2e0 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py @@ -0,0 +1,271 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/auto_augment.py +""" +import copy + +import cv2 +import mmcv +import numpy as np +from mmdet.datasets.builder import PIPELINES + +_MAX_LEVEL = 10 + + +def level_to_value(level, max_value): + """Map from level to values based on max_value.""" + return (level / _MAX_LEVEL) * max_value + + +def random_negative(value, random_negative_prob): + """Randomly negate value based on random_negative_prob.""" + return -value if np.random.rand() < random_negative_prob else value + + +def bbox2fields(): + """The key correspondence from bboxes to labels, masks and + segmentations.""" + bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + bbox2seg = { + 'gt_bboxes': 'gt_semantic_seg', + } + return bbox2label, bbox2mask, bbox2seg + + +@PIPELINES.register_module() +class RotateV2(object): + """Apply Rotate Transformation to image (and its corresponding bbox, mask, + segmentation). + + Args: + level (int | float): The level should be in range (0,_MAX_LEVEL]. + scale (int | float): Isotropic scale factor. Same in + ``mmcv.imrotate``. + center (int | float | tuple[float]): Center point (w, h) of the + rotation in the source image. If None, the center of the + image will be used. Same in ``mmcv.imrotate``. + img_fill_val (int | float | tuple): The fill value for image border. + If float, the same value will be used for all the three + channels of image. If tuple, the should be 3 elements (e.g. + equals the number of channels for image). + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Default 255. + prob (float): The probability for perform transformation and + should be in range 0 to 1. + max_rotate_angle (int | float): The maximum angles for rotate + transformation. + random_negative_prob (float): The probability that turns the + offset negative. + """ + + def __init__(self, + level, + scale=1, + center=None, + img_fill_val=128, + seg_ignore_label=255, + prob=0.5, + max_rotate_angle=30, + random_negative_prob=0.5): + assert isinstance(level, (int, float)), \ + f'The level must be type int or float. got {type(level)}.' 
+ assert 0 <= level <= _MAX_LEVEL, \ + f'The level should be in range (0,{_MAX_LEVEL}]. got {level}.' + assert isinstance(scale, (int, float)), \ + f'The scale must be type int or float. got type {type(scale)}.' + if isinstance(center, (int, float)): + center = (center, center) + elif isinstance(center, tuple): + assert len(center) == 2, 'center with type tuple must have '\ + f'2 elements. got {len(center)} elements.' + else: + assert center is None, 'center must be None or type int, '\ + f'float or tuple, got type {type(center)}.' + if isinstance(img_fill_val, (float, int)): + img_fill_val = tuple([float(img_fill_val)] * 3) + elif isinstance(img_fill_val, tuple): + assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\ + f'have 3 elements. got {len(img_fill_val)}.' + img_fill_val = tuple([float(val) for val in img_fill_val]) + else: + raise ValueError( + 'img_fill_val must be float or tuple with 3 elements.') + assert np.all([0 <= val <= 255 for val in img_fill_val]), \ + 'all elements of img_fill_val should between range [0,255]. '\ + f'got {img_fill_val}.' + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\ + f'got {prob}.' + assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\ + f'should be type int or float. got type {type(max_rotate_angle)}.' + self.level = level + self.scale = scale + # Rotation angle in degrees. Positive values mean + # clockwise rotation. + self.angle = level_to_value(level, max_rotate_angle) + self.center = center + self.img_fill_val = img_fill_val + self.seg_ignore_label = seg_ignore_label + self.prob = prob + self.max_rotate_angle = max_rotate_angle + self.random_negative_prob = random_negative_prob + + def _rotate_img(self, results, angle, center=None, scale=1.0): + """Rotate the image. + + Args: + results (dict): Result dict from loading pipeline. + angle (float): Rotation angle in degrees, positive values + mean clockwise rotation. Same in ``mmcv.imrotate``. + center (tuple[float], optional): Center point (w, h) of the + rotation. Same in ``mmcv.imrotate``. + scale (int | float): Isotropic scale factor. Same in + ``mmcv.imrotate``. 
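A worked example of the angle computation above, under the defaults shown (illustrative, not part of the patch):

_MAX_LEVEL = 10
level, max_rotate_angle = 5, 30
angle = (level / _MAX_LEVEL) * max_rotate_angle   # level_to_value(5, 30) -> 15.0 degrees
# random_negative(angle, random_negative_prob=0.5) then flips the sign half the time,
# so when the transform fires (prob=0.5) the image is rotated by +15 or -15 degrees.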
+ """ + for key in results.get('img_fields', ['img']): + img = results[key].copy() + img_rotated = mmcv.imrotate( + img, angle, center, scale, border_value=self.img_fill_val) + results[key] = img_rotated.astype(img.dtype) + results['img_shape'] = results[key].shape + + def _rotate_bboxes(self, results, rotate_matrix): + """Rotate the bboxes.""" + h, w, c = results['img_shape'] + for key in results.get('bbox_fields', []): + min_x, min_y, max_x, max_y = np.split( + results[key], results[key].shape[-1], axis=-1) + coordinates = np.stack([[min_x, min_y], [max_x, min_y], + [min_x, max_y], + [max_x, max_y]]) # [4, 2, nb_bbox, 1] + # pad 1 to convert from format [x, y] to homogeneous + # coordinates format [x, y, 1] + coordinates = np.concatenate( + (coordinates, + np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)), + axis=1) # [4, 3, nb_bbox, 1] + coordinates = coordinates.transpose( + (2, 0, 1, 3)) # [nb_bbox, 4, 3, 1] + rotated_coords = np.matmul(rotate_matrix, + coordinates) # [nb_bbox, 4, 2, 1] + rotated_coords = rotated_coords[..., 0] # [nb_bbox, 4, 2] + min_x, min_y = np.min( + rotated_coords[:, :, 0], axis=1), np.min( + rotated_coords[:, :, 1], axis=1) + max_x, max_y = np.max( + rotated_coords[:, :, 0], axis=1), np.max( + rotated_coords[:, :, 1], axis=1) + results[key] = np.stack([min_x, min_y, max_x, max_y], + axis=-1).astype(results[key].dtype) + + def _rotate_keypoints90(self, results, angle): + """Rotate the keypoints, only valid when angle in [-90,90,-180,180]""" + if angle not in [-90, 90, 180, -180 + ] or self.scale != 1 or self.center is not None: + return + for key in results.get('keypoints_fields', []): + k = results[key] + if angle == 90: + w, h, c = results['img'].shape + new = np.stack([h - k[..., 1], k[..., 0], k[..., 2]], axis=-1) + elif angle == -90: + w, h, c = results['img'].shape + new = np.stack([k[..., 1], w - k[..., 0], k[..., 2]], axis=-1) + else: + h, w, c = results['img'].shape + new = np.stack([w - k[..., 0], h - k[..., 1], k[..., 2]], + axis=-1) + # a kps is invalid if thrid value is -1 + kps_invalid = new[..., -1][:, -1] == -1 + new[kps_invalid] = np.zeros(new.shape[1:]) - 1 + results[key] = new + + def _rotate_masks(self, + results, + angle, + center=None, + scale=1.0, + fill_val=0): + """Rotate the masks.""" + h, w, c = results['img_shape'] + for key in results.get('mask_fields', []): + masks = results[key] + results[key] = masks.rotate((h, w), angle, center, scale, fill_val) + + def _rotate_seg(self, + results, + angle, + center=None, + scale=1.0, + fill_val=255): + """Rotate the segmentation map.""" + for key in results.get('seg_fields', []): + seg = results[key].copy() + results[key] = mmcv.imrotate( + seg, angle, center, scale, + border_value=fill_val).astype(seg.dtype) + + def _filter_invalid(self, results, min_bbox_size=0): + """Filter bboxes and corresponding masks too small after rotate + augmentation.""" + bbox2label, bbox2mask, _ = bbox2fields() + for key in results.get('bbox_fields', []): + bbox_w = results[key][:, 2] - results[key][:, 0] + bbox_h = results[key][:, 3] - results[key][:, 1] + valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size) + valid_inds = np.nonzero(valid_inds)[0] + results[key] = results[key][valid_inds] + # label fields. e.g. gt_labels and gt_labels_ignore + label_key = bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][valid_inds] + # mask fields, e.g. 
gt_masks and gt_masks_ignore + mask_key = bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][valid_inds] + + def __call__(self, results): + """Call function to rotate images, bounding boxes, masks and semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Rotated results. + """ + if np.random.rand() > self.prob: + return results + h, w = results['img'].shape[:2] + center = self.center + if center is None: + center = ((w - 1) * 0.5, (h - 1) * 0.5) + angle = random_negative(self.angle, self.random_negative_prob) + self._rotate_img(results, angle, center, self.scale) + rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale) + self._rotate_bboxes(results, rotate_matrix) + self._rotate_keypoints90(results, angle) + self._rotate_masks(results, angle, center, self.scale, fill_val=0) + self._rotate_seg( + results, angle, center, self.scale, fill_val=self.seg_ignore_label) + self._filter_invalid(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(level={self.level}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'center={self.center}, ' + repr_str += f'img_fill_val={self.img_fill_val}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label}, ' + repr_str += f'prob={self.prob}, ' + repr_str += f'max_rotate_angle={self.max_rotate_angle}, ' + repr_str += f'random_negative_prob={self.random_negative_prob})' + return repr_str diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py new file mode 100644 index 00000000..bd2394a8 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py @@ -0,0 +1,113 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/formating.py +""" +import numpy as np +import torch +from mmcv.parallel import DataContainer as DC +from mmdet.datasets.builder import PIPELINES + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PIPELINES.register_module() +class DefaultFormatBundleV2(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. 
+ + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + """ + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with \ + default bundle. + """ + + if 'img' in results: + img = results['img'] + # add default meta keys + results = self._add_default_meta_keys(results) + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + for key in [ + 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_keypointss', + 'gt_labels' + ]: + if key not in results: + continue + results[key] = DC(to_tensor(results[key])) + if 'gt_masks' in results: + results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) + return results + + def _add_default_meta_keys(self, results): + """Add default meta keys. + + We set default meta keys including `pad_shape`, `scale_factor` and + `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and + `Pad` are implemented during the whole pipeline. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + results (dict): Updated result dict contains the data to convert. + """ + img = results['img'] + results.setdefault('pad_shape', img.shape) + results.setdefault('scale_factor', 1.0) + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results.setdefault( + 'img_norm_cfg', + dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False)) + return results + + def __repr__(self): + return self.__class__.__name__ diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py new file mode 100644 index 00000000..b4c2a385 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py @@ -0,0 +1,225 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/loading.py +""" +import os.path as osp + +import numpy as np +import pycocotools.mask as maskUtils +from mmdet.core import BitmapMasks, PolygonMasks +from mmdet.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class LoadAnnotationsV2(object): + """Load mutiple types of annotations. + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. + Default: True. + with_label (bool): Whether to parse and load the label annotation. + Default: True. + with_keypoints (bool): Whether to parse and load the keypoints annotation. + Default: False. + with_mask (bool): Whether to parse and load the mask annotation. + Default: False. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. 
Default: False. + poly2mask (bool): Whether to convert the instance masks from polygons + to bitmaps. Default: True. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + with_bbox=True, + with_label=True, + with_keypoints=False, + with_mask=False, + with_seg=False, + poly2mask=True, + file_client_args=dict(backend='disk')): + self.with_bbox = with_bbox + self.with_label = with_label + self.with_keypoints = with_keypoints + self.with_mask = with_mask + self.with_seg = with_seg + self.poly2mask = poly2mask + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _load_bboxes(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. + """ + + ann_info = results['ann_info'] + results['gt_bboxes'] = ann_info['bboxes'].copy() + + gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) + if gt_bboxes_ignore is not None: + results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() + results['bbox_fields'].append('gt_bboxes_ignore') + results['bbox_fields'].append('gt_bboxes') + return results + + def _load_keypoints(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. + """ + + ann_info = results['ann_info'] + results['gt_keypointss'] = ann_info['keypointss'].copy() + + results['keypoints_fields'] = ['gt_keypointss'] + return results + + def _load_labels(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded label annotations. + """ + + results['gt_labels'] = results['ann_info']['labels'].copy() + return results + + def _poly2mask(self, mask_ann, img_h, img_w): + """Private function to convert masks represented with polygon to + bitmaps. + + Args: + mask_ann (list | dict): Polygon mask annotation input. + img_h (int): The height of output mask. + img_w (int): The width of output mask. + + Returns: + numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). + """ + + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def process_polygons(self, polygons): + """Convert polygons to list of ndarray and filter invalid polygons. + + Args: + polygons (list[list]): Polygons of one instance. + + Returns: + list[numpy.ndarray]: Processed polygons. + """ + + polygons = [np.array(p) for p in polygons] + valid_polygons = [] + for polygon in polygons: + if len(polygon) % 2 == 0 and len(polygon) >= 6: + valid_polygons.append(polygon) + return valid_polygons + + def _load_masks(self, results): + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded mask annotations. 
+ If ``self.poly2mask`` is set ``True``, `gt_mask` will contain + :obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used. + """ + + h, w = results['img_info']['height'], results['img_info']['width'] + gt_masks = results['ann_info']['masks'] + if self.poly2mask: + gt_masks = BitmapMasks( + [self._poly2mask(mask, h, w) for mask in gt_masks], h, w) + else: + gt_masks = PolygonMasks( + [self.process_polygons(polygons) for polygons in gt_masks], h, + w) + results['gt_masks'] = gt_masks + results['mask_fields'].append('gt_masks') + return results + + def _load_semantic_seg(self, results): + """Private function to load semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. + """ + import mmcv + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + filename = osp.join(results['seg_prefix'], + results['ann_info']['seg_map']) + img_bytes = self.file_client.get(filename) + results['gt_semantic_seg'] = mmcv.imfrombytes( + img_bytes, flag='unchanged').squeeze() + results['seg_fields'].append('gt_semantic_seg') + return results + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box, label, mask and + semantic segmentation annotations. + """ + + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_keypoints: + results = self._load_keypoints(results) + if self.with_mask: + results = self._load_masks(results) + if self.with_seg: + results = self._load_semantic_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_keypoints={self.with_keypoints}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg})' + repr_str += f'poly2mask={self.poly2mask})' + repr_str += f'poly2mask={self.file_client_args})' + return repr_str diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py new file mode 100755 index 00000000..270c34da --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py @@ -0,0 +1,737 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py +""" +import mmcv +import numpy as np +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps +from mmdet.datasets.builder import PIPELINES +from numpy import random + + +@PIPELINES.register_module() +class ResizeV2(object): + """Resize images & bbox & mask &kps. + + This transform resizes the input image to some scale. Bboxes and masks are + then resized with the same scale factor. If the input dict contains the key + "scale", then the scale in the input dict is used, otherwise the specified + scale in the init method is used. 
If the input dict contains the key + "scale_factor" (if MultiScaleFlipAug does not give img_scale but + scale_factor), the actual scale will be computed by image shape and + scale_factor. + + `img_scale` can either be a tuple (single-scale) or a list of tuple + (multi-scale). There are 3 multiscale modes: + + - ``ratio_range is not None``: randomly sample a ratio from the ratio \ + range and multiply it with the image scale. + - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ + sample a scale from the multiscale range. + - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ + sample a scale from multiple scales. + + Args: + img_scale (tuple or list[tuple]): Images scales for resizing. + multiscale_mode (str): Either "range" or "value". + ratio_range (tuple[float]): (min_ratio, max_ratio) + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + override (bool, optional): Whether to override `scale` and + `scale_factor` so as to call resize twice. Default False. If True, + after the first resizing, the existed `scale` and `scale_factor` + will be ignored so the second resizing can be allowed. + This option is a work-around for multiple times of resize in DETR. + Defaults to False. + """ + + def __init__(self, + img_scale=None, + multiscale_mode='range', + ratio_range=None, + keep_ratio=True, + bbox_clip_border=True, + backend='cv2', + override=False): + if img_scale is None: + self.img_scale = None + else: + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + assert mmcv.is_list_of(self.img_scale, tuple) + + if ratio_range is not None: + # mode 1: given a scale and a range of image ratio + assert len(self.img_scale) == 1 + else: + # mode 2: given multiple scales or a range of scales + assert multiscale_mode in ['value', 'range'] + + self.backend = backend + self.multiscale_mode = multiscale_mode + self.ratio_range = ratio_range + self.keep_ratio = keep_ratio + # TODO: refactor the override option in Resize + self.override = override + self.bbox_clip_border = bbox_clip_border + + @staticmethod + def random_select(img_scales): + """Randomly select an img_scale from given candidates. + + Args: + img_scales (list[tuple]): Images scales for selection. + + Returns: + (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ + where ``img_scale`` is the selected image scale and \ + ``scale_idx`` is the selected index in the given candidates. + """ + + assert mmcv.is_list_of(img_scales, tuple) + scale_idx = np.random.randint(len(img_scales)) + img_scale = img_scales[scale_idx] + return img_scale, scale_idx + + @staticmethod + def random_sample(img_scales): + """Randomly sample an img_scale when ``multiscale_mode=='range'``. + + Args: + img_scales (list[tuple]): Images scale range for sampling. + There must be two tuples in img_scales, which specify the lower + and uper bound of image scales. + + Returns: + (tuple, None): Returns a tuple ``(img_scale, None)``, where \ + ``img_scale`` is sampled scale and None is just a placeholder \ + to be consistent with :func:`random_select`. 
+ """ + + assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), + max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), + max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale, None + + @staticmethod + def random_sample_ratio(img_scale, ratio_range): + """Randomly sample an img_scale when ``ratio_range`` is specified. + + A ratio will be randomly sampled from the range specified by + ``ratio_range``. Then it would be multiplied with ``img_scale`` to + generate sampled scale. + + Args: + img_scale (tuple): Images scale base to multiply with ratio. + ratio_range (tuple[float]): The minimum and maximum ratio to scale + the ``img_scale``. + + Returns: + (tuple, None): Returns a tuple ``(scale, None)``, where \ + ``scale`` is sampled ratio multiplied with ``img_scale`` and \ + None is just a placeholder to be consistent with \ + :func:`random_select`. + """ + + assert isinstance(img_scale, tuple) and len(img_scale) == 2 + min_ratio, max_ratio = ratio_range + assert min_ratio <= max_ratio + ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio + scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) + return scale, None + + def _random_scale(self, results): + """Randomly sample an img_scale according to ``ratio_range`` and + ``multiscale_mode``. + + If ``ratio_range`` is specified, a ratio will be sampled and be + multiplied with ``img_scale``. + If multiple scales are specified by ``img_scale``, a scale will be + sampled according to ``multiscale_mode``. + Otherwise, single scale will be used. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: Two new keys 'scale` and 'scale_idx` are added into \ + ``results``, which would be used by subsequent pipelines. 
+ """ + + if self.ratio_range is not None: + scale, scale_idx = self.random_sample_ratio( + self.img_scale[0], self.ratio_range) + elif len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + elif self.multiscale_mode == 'range': + scale, scale_idx = self.random_sample(self.img_scale) + elif self.multiscale_mode == 'value': + scale, scale_idx = self.random_select(self.img_scale) + else: + raise NotImplementedError + + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + for key in results.get('img_fields', ['img']): + if self.keep_ratio: + img, scale_factor = mmcv.imrescale( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results[key].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + results[key] = img + + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img_shape'] = img.shape + # in case that there is no padding + results['pad_shape'] = img.shape + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_bboxes(self, results): + """Resize bounding boxes with ``results['scale_factor']``.""" + for key in results.get('bbox_fields', []): + bboxes = results[key] * results['scale_factor'] + if self.bbox_clip_border: + img_shape = results['img_shape'] + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + results[key] = bboxes + + def _resize_keypoints(self, results): + """Resize keypoints with ``results['scale_factor']``.""" + for key in results.get('keypoints_fields', []): + keypointss = results[key].copy() + factors = results['scale_factor'] + assert factors[0] == factors[2] + assert factors[1] == factors[3] + keypointss[:, :, 0] *= factors[0] + keypointss[:, :, 1] *= factors[1] + if self.bbox_clip_border: + img_shape = results['img_shape'] + keypointss[:, :, 0] = np.clip(keypointss[:, :, 0], 0, + img_shape[1]) + keypointss[:, :, 1] = np.clip(keypointss[:, :, 1], 0, + img_shape[0]) + results[key] = keypointss + + def _resize_masks(self, results): + """Resize masks with ``results['scale']``""" + for key in results.get('mask_fields', []): + if results[key] is None: + continue + if self.keep_ratio: + results[key] = results[key].rescale(results['scale']) + else: + results[key] = results[key].resize(results['img_shape'][:2]) + + def _resize_seg(self, results): + """Resize semantic segmentation map with ``results['scale']``.""" + for key in results.get('seg_fields', []): + if self.keep_ratio: + gt_seg = mmcv.imrescale( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + else: + gt_seg = mmcv.imresize( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + results['gt_semantic_seg'] = gt_seg + + def __call__(self, results): + """Call function to resize images, bounding boxes, masks, semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \ + 'keep_ratio' keys are added into result dict. 
+ """ + + if 'scale' not in results: + if 'scale_factor' in results: + img_shape = results['img'].shape[:2] + scale_factor = results['scale_factor'] + assert isinstance(scale_factor, float) + results['scale'] = tuple( + [int(x * scale_factor) for x in img_shape][::-1]) + else: + self._random_scale(results) + else: + if not self.override: + assert 'scale_factor' not in results, ( + 'scale and scale_factor cannot be both set.') + else: + results.pop('scale') + if 'scale_factor' in results: + results.pop('scale_factor') + self._random_scale(results) + + self._resize_img(results) + self._resize_bboxes(results) + self._resize_keypoints(results) + self._resize_masks(results) + self._resize_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'multiscale_mode={self.multiscale_mode}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'keep_ratio={self.keep_ratio})' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class RandomFlipV2(object): + """Flip the image & bbox & mask & kps. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + When random flip is enabled, ``flip_ratio``/``direction`` can either be a + float/string or tuple of float/string. There are 3 flip modes: + + - ``flip_ratio`` is float, ``direction`` is string: the image will be + ``direction``ly flipped with probability of ``flip_ratio`` . + E.g., ``flip_ratio=0.5``, ``direction='horizontal'``, + then image will be horizontally flipped with probability of 0.5. + - ``flip_ratio`` is float, ``direction`` is list of string: the image wil + be ``direction[i]``ly flipped with probability of + ``flip_ratio/len(direction)``. + E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``, + then image will be horizontally flipped with probability of 0.25, + vertically with probability of 0.25. + - ``flip_ratio`` is list of float, ``direction`` is list of string: + given ``len(flip_ratio) == len(direction)``, the image wil + be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``. + E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with probability + of 0.3, vertically with probability of 0.5 + + Args: + flip_ratio (float | list[float], optional): The flipping probability. + Default: None. + direction(str | list[str], optional): The flipping direction. Options + are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'. + If input is a list, the length must equal ``flip_ratio``. Each + element in ``flip_ratio`` indicates the flip probability of + corresponding direction. 
+ """ + + def __init__(self, flip_ratio=None, direction='horizontal'): + if isinstance(flip_ratio, list): + assert mmcv.is_list_of(flip_ratio, float) + assert 0 <= sum(flip_ratio) <= 1 + elif isinstance(flip_ratio, float): + assert 0 <= flip_ratio <= 1 + elif flip_ratio is None: + pass + else: + raise ValueError('flip_ratios must be None, float, ' + 'or list of float') + self.flip_ratio = flip_ratio + + valid_directions = ['horizontal', 'vertical', 'diagonal'] + if isinstance(direction, str): + assert direction in valid_directions + elif isinstance(direction, list): + assert mmcv.is_list_of(direction, str) + assert set(direction).issubset(set(valid_directions)) + else: + raise ValueError('direction must be either str or list of str') + self.direction = direction + + if isinstance(flip_ratio, list): + assert len(self.flip_ratio) == len(self.direction) + self.count = 0 + + def bbox_flip(self, bboxes, img_shape, direction): + """Flip bboxes horizontally. + + Args: + bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k) + img_shape (tuple[int]): Image shape (height, width) + direction (str): Flip direction. Options are 'horizontal', + 'vertical'. + + Returns: + numpy.ndarray: Flipped bounding boxes. + """ + + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.copy() + if direction == 'horizontal': + w = img_shape[1] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + elif direction == 'vertical': + h = img_shape[0] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + elif direction == 'diagonal': + w = img_shape[1] + h = img_shape[0] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + else: + raise ValueError(f"Invalid flipping direction '{direction}'") + return flipped + + def keypoints_flip(self, keypointss, img_shape, direction): + """Flip keypoints horizontally.""" + + assert direction == 'horizontal' + assert keypointss.shape[-1] == 3 + num_kps = keypointss.shape[1] + assert num_kps in [4, 5], f'Only Support num_kps=4 or 5, got:{num_kps}' + assert keypointss.ndim == 3 + flipped = keypointss.copy() + if num_kps == 5: + flip_order = [1, 0, 2, 4, 3] + elif num_kps == 4: + flip_order = [3, 2, 1, 0] + for idx, a in enumerate(flip_order): + flipped[:, idx, :] = keypointss[:, a, :] + w = img_shape[1] + flipped[..., 0] = w - flipped[..., 0] + return flipped + + def __call__(self, results): + """Call function to flip bounding boxes, masks, semantic segmentation + maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction' keys are added \ + into result dict. 
+ """ + if 'flip' not in results: + if isinstance(self.direction, list): + # None means non-flip + direction_list = self.direction + [None] + else: + # None means non-flip + direction_list = [self.direction, None] + + if isinstance(self.flip_ratio, list): + non_flip_ratio = 1 - sum(self.flip_ratio) + flip_ratio_list = self.flip_ratio + [non_flip_ratio] + else: + non_flip_ratio = 1 - self.flip_ratio + # exclude non-flip + single_ratio = self.flip_ratio / (len(direction_list) - 1) + flip_ratio_list = [single_ratio] * (len(direction_list) + - 1) + [non_flip_ratio] + + cur_dir = np.random.choice(direction_list, p=flip_ratio_list) + + results['flip'] = cur_dir is not None + if 'flip_direction' not in results: + results['flip_direction'] = cur_dir + if results['flip']: + # flip image + for key in results.get('img_fields', ['img']): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + # flip bboxes + for key in results.get('bbox_fields', []): + results[key] = self.bbox_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip kps + for key in results.get('keypoints_fields', []): + results[key] = self.keypoints_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip masks + for key in results.get('mask_fields', []): + results[key] = results[key].flip(results['flip_direction']) + + # flip segs + for key in results.get('seg_fields', []): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})' + + +@PIPELINES.register_module() +class RandomSquareCrop(object): + """Random crop the image & bboxes, the cropped patches have minimum IoU + requirement with original image & bboxes, the IoU threshold is randomly + selected from min_ious. + + Args: + min_ious (tuple): minimum IoU threshold for all intersections with + bounding boxes + min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, + where a >= min_crop_size). + + Note: + The keys for bboxes, labels and masks should be paired. That is, \ + `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ + `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. + """ + + def __init__(self, + crop_ratio_range=None, + crop_choice=None, + bbox_clip_border=True, + big_face_ratio=0, + big_face_crop_choice=None): + + self.crop_ratio_range = crop_ratio_range + self.crop_choice = crop_choice + self.big_face_crop_choice = big_face_crop_choice + self.bbox_clip_border = bbox_clip_border + + assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) + if self.crop_ratio_range is not None: + self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range + + self.bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + self.bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + assert big_face_ratio >= 0 and big_face_ratio <= 1.0 + self.big_face_ratio = big_face_ratio + + def __call__(self, results): + """Call function to crop images and bounding boxes with minimum IoU + constraint. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images and bounding boxes cropped, \ + 'img_shape' key is updated. 
+ """ + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + assert 'bbox_fields' in results + assert 'gt_bboxes' in results + # try augment big face images + find_bigface = False + if np.random.random() < self.big_face_ratio: + min_size = 100 # h and w + expand_ratio = 0.3 # expand ratio of croped face alongwith both w and h + bbox = results['gt_bboxes'].copy() + lmks = results['gt_keypointss'].copy() + label = results['gt_labels'].copy() + # filter small faces + size_mask = ((bbox[:, 2] - bbox[:, 0]) > min_size) * ( + (bbox[:, 3] - bbox[:, 1]) > min_size) + bbox = bbox[size_mask] + lmks = lmks[size_mask] + label = label[size_mask] + # randomly choose a face that has no overlap with others + if len(bbox) > 0: + overlaps = bbox_overlaps(bbox, bbox) + overlaps -= np.eye(overlaps.shape[0]) + iou_mask = np.sum(overlaps, axis=1) == 0 + bbox = bbox[iou_mask] + lmks = lmks[iou_mask] + label = label[iou_mask] + if len(bbox) > 0: + choice = np.random.randint(len(bbox)) + bbox = bbox[choice] + lmks = lmks[choice] + label = [label[choice]] + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x1 = bbox[0] - w * expand_ratio + x2 = bbox[2] + w * expand_ratio + y1 = bbox[1] - h * expand_ratio + y2 = bbox[3] + h * expand_ratio + x1, x2 = np.clip([x1, x2], 0, img.shape[1]) + y1, y2 = np.clip([y1, y2], 0, img.shape[0]) + bbox -= np.tile([x1, y1], 2) + lmks -= (x1, y1, 0) + + find_bigface = True + img = img[int(y1):int(y2), int(x1):int(x2), :] + results['gt_bboxes'] = np.expand_dims(bbox, axis=0) + results['gt_keypointss'] = np.expand_dims(lmks, axis=0) + results['gt_labels'] = np.array(label) + results['img'] = img + + boxes = results['gt_bboxes'] + h, w, c = img.shape + + if self.crop_ratio_range is not None: + max_scale = self.crop_ratio_max + else: + max_scale = np.amax(self.crop_choice) + scale_retry = 0 + while True: + scale_retry += 1 + if scale_retry == 1 or max_scale > 1.0: + if self.crop_ratio_range is not None: + scale = np.random.uniform(self.crop_ratio_min, + self.crop_ratio_max) + elif self.crop_choice is not None: + scale = np.random.choice(self.crop_choice) + else: + scale = scale * 1.2 + + if find_bigface: + # select a scale from big_face_crop_choice if in big_face mode + scale = np.random.choice(self.big_face_crop_choice) + + for i in range(250): + long_side = max(w, h) + cw = int(scale * long_side) + ch = cw + + # TODO +1 + if w == cw: + left = 0 + elif w > cw: + left = random.randint(0, w - cw) + else: + left = random.randint(w - cw, 0) + if h == ch: + top = 0 + elif h > ch: + top = random.randint(0, h - ch) + else: + top = random.randint(h - ch, 0) + + patch = np.array( + (int(left), int(top), int(left + cw), int(top + ch)), + dtype=np.int32) + + # center of boxes should inside the crop img + # only adjust boxes and instance masks when the gt is not empty + # adjust boxes + def is_center_of_bboxes_in_patch(boxes, patch): + # TODO >= + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = \ + ((center[:, 0] > patch[0]) + * (center[:, 1] > patch[1]) + * (center[:, 0] < patch[2]) + * (center[:, 1] < patch[3])) + return mask + + mask = is_center_of_bboxes_in_patch(boxes, patch) + if not mask.any(): + continue + for key in results.get('bbox_fields', []): + boxes = results[key].copy() + mask = is_center_of_bboxes_in_patch(boxes, patch) + boxes = boxes[mask] + if self.bbox_clip_border: + boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) + boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) + boxes -= np.tile(patch[:2], 
2) + + results[key] = boxes + # labels + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][mask] + + # keypoints field + if key == 'gt_bboxes': + for kps_key in results.get('keypoints_fields', []): + keypointss = results[kps_key].copy() + keypointss = keypointss[mask, :, :] + if self.bbox_clip_border: + keypointss[:, :, : + 2] = keypointss[:, :, :2].clip( + max=patch[2:]) + keypointss[:, :, : + 2] = keypointss[:, :, :2].clip( + min=patch[:2]) + keypointss[:, :, 0] -= patch[0] + keypointss[:, :, 1] -= patch[1] + results[kps_key] = keypointss + + # mask fields + mask_key = self.bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][mask.nonzero() + [0]].crop(patch) + + # adjust the img no matter whether the gt is empty before crop + rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 + patch_from = patch.copy() + patch_from[0] = max(0, patch_from[0]) + patch_from[1] = max(0, patch_from[1]) + patch_from[2] = min(img.shape[1], patch_from[2]) + patch_from[3] = min(img.shape[0], patch_from[3]) + patch_to = patch.copy() + patch_to[0] = max(0, patch_to[0] * -1) + patch_to[1] = max(0, patch_to[1] * -1) + patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) + patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) + rimg[patch_to[1]:patch_to[3], + patch_to[0]:patch_to[2], :] = img[ + patch_from[1]:patch_from[3], + patch_from[0]:patch_from[2], :] + img = rimg + results['img'] = img + results['img_shape'] = img.shape + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(min_ious={self.min_iou}, ' + repr_str += f'crop_size={self.crop_size})' + return repr_str diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/retinaface.py similarity index 97% rename from modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/retinaface.py index bbacd9be..40c440b9 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/retinaface.py @@ -13,7 +13,7 @@ class RetinaFaceDataset(CustomDataset): CLASSES = ('FG', ) def __init__(self, min_size=None, **kwargs): - self.NK = 5 + self.NK = kwargs.pop('num_kps', 5) self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)} self.min_size = min_size self.gt_path = kwargs.get('gt_path') @@ -33,7 +33,8 @@ class RetinaFaceDataset(CustomDataset): if len(values) > 4: if len(values) > 5: kps = np.array( - values[4:19], dtype=np.float32).reshape((self.NK, 3)) + values[4:4 + self.NK * 3], dtype=np.float32).reshape( + (self.NK, 3)) for li in range(kps.shape[0]): if (kps[li, :] == -1).all(): kps[li][2] = 0.0 # weight = 0, ignore diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py rename to 
modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py similarity index 99% rename from modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py index acc45670..77ec99cf 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py @@ -103,6 +103,7 @@ class SCRFDHead(AnchorHead): scale_mode=1, dw_conv=False, use_kps=False, + num_kps=5, loss_kps=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), **kwargs): @@ -116,7 +117,7 @@ class SCRFDHead(AnchorHead): self.scale_mode = scale_mode self.use_dfl = True self.dw_conv = dw_conv - self.NK = 5 + self.NK = num_kps self.extra_flops = 0.0 if loss_dfl is None or not loss_dfl: self.use_dfl = False @@ -323,8 +324,8 @@ class SCRFDHead(AnchorHead): batch_size, -1, self.cls_out_channels).sigmoid() bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 4) - kps_pred = kps_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 10) - + kps_pred = kps_pred.permute(0, 2, 3, + 1).reshape(batch_size, -1, self.NK * 2) return cls_score, bbox_pred, kps_pred def forward_train(self, @@ -788,7 +789,7 @@ class SCRFDHead(AnchorHead): if self.use_dfl: kps_pred = self.integral(kps_pred) * stride[0] else: - kps_pred = kps_pred.reshape((-1, 10)) * stride[0] + kps_pred = kps_pred.reshape((-1, self.NK * 2)) * stride[0] nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: @@ -815,7 +816,7 @@ class SCRFDHead(AnchorHead): mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) if mlvl_kps is not None: scale_factor2 = torch.tensor( - [scale_factor[0], scale_factor[1]] * 5) + [scale_factor[0], scale_factor[1]] * self.NK) mlvl_kps /= scale_factor2.to(mlvl_kps.device) mlvl_scores = torch.cat(mlvl_scores) diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/scrfd.py similarity index 50% rename from modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py rename 
to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/scrfd.py index a5f5cac2..18b46be1 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/scrfd.py @@ -54,7 +54,13 @@ class SCRFD(SingleStageDetector): gt_bboxes_ignore) return losses - def simple_test(self, img, img_metas, rescale=False): + def simple_test(self, + img, + img_metas, + rescale=False, + repeat_head=1, + output_kps_var=0, + output_results=1): """Test function without test time augmentation. Args: @@ -62,6 +68,9 @@ class SCRFD(SingleStageDetector): img_metas (list[dict]): List of image information. rescale (bool, optional): Whether to rescale the results. Defaults to False. + repeat_head (int): repeat inference times in head + output_kps_var (int): whether output kps var to calculate quality + output_results (int): 0: nothing 1: bbox 2: both bbox and kps Returns: list[list[np.ndarray]]: BBox results of each image and classes. @@ -69,40 +78,71 @@ class SCRFD(SingleStageDetector): corresponds to each class. """ x = self.extract_feat(img) - outs = self.bbox_head(x) - if torch.onnx.is_in_onnx_export(): - print('single_stage.py in-onnx-export') - print(outs.__class__) - cls_score, bbox_pred, kps_pred = outs - for c in cls_score: - print(c.shape) - for c in bbox_pred: - print(c.shape) - if self.bbox_head.use_kps: - for c in kps_pred: - print(c.shape) - return (cls_score, bbox_pred, kps_pred) - else: - return (cls_score, bbox_pred) - bbox_list = self.bbox_head.get_bboxes( - *outs, img_metas, rescale=rescale) + assert repeat_head >= 1 + kps_out0 = [] + kps_out1 = [] + kps_out2 = [] + for i in range(repeat_head): + outs = self.bbox_head(x) + kps_out0 += [outs[2][0].detach().cpu().numpy()] + kps_out1 += [outs[2][1].detach().cpu().numpy()] + kps_out2 += [outs[2][2].detach().cpu().numpy()] + if output_kps_var: + var0 = np.var(np.vstack(kps_out0), axis=0).mean() + var1 = np.var(np.vstack(kps_out1), axis=0).mean() + var2 = np.var(np.vstack(kps_out2), axis=0).mean() + var = np.mean([var0, var1, var2]) + else: + var = None - # return kps if use_kps - if len(bbox_list[0]) == 2: - bbox_results = [ - bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) - for det_bboxes, det_labels in bbox_list - ] - elif len(bbox_list[0]) == 3: - bbox_results = [ - bbox2result( - det_bboxes, - det_labels, - self.bbox_head.num_classes, - kps=det_kps) - for det_bboxes, det_labels, det_kps in bbox_list - ] - return bbox_results + if output_results > 0: + if torch.onnx.is_in_onnx_export(): + print('single_stage.py in-onnx-export') + print(outs.__class__) + cls_score, bbox_pred, kps_pred = outs + for c in cls_score: + print(c.shape) + for c in bbox_pred: + print(c.shape) + if self.bbox_head.use_kps: + for c in kps_pred: + print(c.shape) + return (cls_score, bbox_pred, kps_pred) + else: + return (cls_score, bbox_pred) + bbox_list = self.bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + + # return kps if use_kps + if len(bbox_list[0]) == 2: + bbox_results = [ + bbox2result(det_bboxes, det_labels, + self.bbox_head.num_classes) + for det_bboxes, det_labels in bbox_list + ] + elif len(bbox_list[0]) == 3: + if output_results == 2: + bbox_results = [ + bbox2result( + det_bboxes, + det_labels, + self.bbox_head.num_classes, + kps=det_kps, + num_kps=self.bbox_head.NK) + for det_bboxes, det_labels, det_kps in bbox_list + ] + elif output_results == 1: + bbox_results = [ + bbox2result(det_bboxes, det_labels, + 
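# Sketch of the keypoint-variance idea added to simple_test above: run the detection head
# several times (repeat_head > 1) and use the spread of the predicted keypoints as a rough
# stability / quality score. The random "head" below is a stand-in for illustration only,
# not the SCRFD head.
import numpy as np

def kps_variance_score(head_fn, feats, repeats=3):
    preds = [head_fn(feats) for _ in range(repeats)]      # each: (num_anchors, NK * 2)
    return np.var(np.vstack(preds), axis=0).mean()        # larger == less stable keypoints

rng = np.random.default_rng(0)
fake_head = lambda f: f + rng.normal(scale=0.1, size=f.shape)  # pretend stochastic head
score = kps_variance_score(fake_head, np.zeros((100, 10)))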
self.bbox_head.num_classes) + for det_bboxes, det_labels, _ in bbox_list + ] + else: + bbox_results = None + if var is not None: + return bbox_results, var + else: + return bbox_results def feature_test(self, img): x = self.extract_feat(img) diff --git a/modelscope/models/cv/face_detection/scrfd/scrfd_detect.py b/modelscope/models/cv/face_detection/scrfd/scrfd_detect.py new file mode 100644 index 00000000..59611604 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/scrfd_detect.py @@ -0,0 +1,71 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from copy import deepcopy +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['ScrfdDetect'] + + +@MODELS.register_module(Tasks.face_detection, module_name=Models.scrfd) +class ScrfdDetect(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the face detection model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + from mmcv import Config + from mmcv.parallel import MMDataParallel + from mmcv.runner import load_checkpoint + from mmdet.models import build_detector + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD + cfg = Config.fromfile(osp.join(model_dir, 'mmcv_scrfd.py')) + ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) + cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3) + detector = build_detector(cfg.model) + logger.info(f'loading model from {ckpt_path}') + device = torch.device( + f'cuda:{0}' if torch.cuda.is_available() else 'cpu') + load_checkpoint(detector, ckpt_path, map_location=device) + detector = MMDataParallel(detector, device_ids=[0]) + detector.eval() + self.detector = detector + logger.info('load model done') + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = self.detector( + return_loss=False, + rescale=True, + img=[input['img'][0].unsqueeze(0)], + img_metas=[[dict(input['img_metas'][0].data)]], + output_results=2) + assert result is not None + result = result[0][0] + bboxes = result[:, :4].tolist() + kpss = result[:, 5:].tolist() + scores = result[:, 4].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.BOXES: bboxes, + OutputKeys.KEYPOINTS: kpss + } + + def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/outputs.py b/modelscope/outputs.py index ab3ea54a..3001c03c 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -90,6 +90,25 @@ TASK_OUTPUTS = { Tasks.face_detection: [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], + # card detection result for single sample + # { + # "scores": [0.9, 0.1, 0.05, 0.05] + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # 
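# The ScrfdDetect.forward above splits each raw detection row into scores, boxes and
# keypoints. A small illustration of that slicing convention (x1, y1, x2, y2, score,
# then NK (x, y) pairs); the single row below is made-up data with NK = 5.
import numpy as np

result = np.array([[10., 10., 110., 80., 0.98,
                    30., 30., 90., 30., 60., 50., 40., 70., 80., 70.]])
bboxes = result[:, :4].tolist()   # -> OutputKeys.BOXES
scores = result[:, 4].tolist()    # -> OutputKeys.SCORES
kpss = result[:, 5:].tolist()     # -> OutputKeys.KEYPOINTS (10 values per face for NK = 5)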
], + # "keypoints": [ + # [x1, y1, x2, y2, x3, y3, x4, y4], + # [x1, y1, x2, y2, x3, y3, x4, y4], + # [x1, y1, x2, y2, x3, y3, x4, y4], + # [x1, y1, x2, y2, x3, y3, x4, y4], + # ], + # } + Tasks.card_detection: + [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], + # facial expression recognition result for single sample # { # "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02], diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index bc9073bc..174d10b1 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -116,6 +116,10 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.hand_2d_keypoints: (Pipelines.hand_2d_keypoints, 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'), + Tasks.face_detection: (Pipelines.face_detection, + 'damo/cv_resnet_facedetection_scrfd10gkps'), + Tasks.card_detection: (Pipelines.card_detection, + 'damo/cv_resnet_carddetection_scrfd34gkps'), Tasks.face_detection: (Pipelines.face_detection, 'damo/cv_resnet101_face-detection_cvpr22papermogface'), diff --git a/modelscope/pipelines/cv/card_detection_pipeline.py b/modelscope/pipelines/cv/card_detection_pipeline.py new file mode 100644 index 00000000..00b18024 --- /dev/null +++ b/modelscope/pipelines/cv/card_detection_pipeline.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.metainfo import Pipelines +from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.cv.face_detection_pipeline import \ + FaceDetectionPipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.card_detection, module_name=Pipelines.card_detection) +class CardDetectionPipeline(FaceDetectionPipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a card detection pipeline for prediction + Args: + model: model id on modelscope hub. + """ + thr = 0.45 # card/face detect use different threshold + super().__init__(model=model, score_thr=thr, **kwargs) diff --git a/modelscope/pipelines/cv/face_detection_pipeline.py b/modelscope/pipelines/cv/face_detection_pipeline.py index eff5b70f..608567a4 100644 --- a/modelscope/pipelines/cv/face_detection_pipeline.py +++ b/modelscope/pipelines/cv/face_detection_pipeline.py @@ -8,6 +8,7 @@ import PIL import torch from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_detection import ScrfdDetect from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES @@ -29,27 +30,8 @@ class FaceDetectionPipeline(Pipeline): model: model id on modelscope hub. 
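# Usage sketch for the pipeline registrations above: card detection reuses the face
# detection pipeline (and ScrfdDetect), differing only in the score threshold (0.3 by
# default for faces, 0.45 passed in by CardDetectionPipeline). The model ids and image
# paths are the ones used elsewhere in this patch.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

face_detection = pipeline(
    Tasks.face_detection,
    model='damo/cv_resnet_facedetection_scrfd10gkps',
    model_revision='v2')
card_detection = pipeline(
    Tasks.card_detection,
    model='damo/cv_resnet_carddetection_scrfd34gkps')
face_result = face_detection('data/test/images/face_detection2.jpeg')
card_result = card_detection('data/test/images/card_detection.jpg')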
""" super().__init__(model=model, **kwargs) - from mmcv import Config - from mmcv.parallel import MMDataParallel - from mmcv.runner import load_checkpoint - from mmdet.models import build_detector - from modelscope.models.cv.face_detection.mmdet_patch.datasets import RetinaFaceDataset - from modelscope.models.cv.face_detection.mmdet_patch.datasets.pipelines import RandomSquareCrop - from modelscope.models.cv.face_detection.mmdet_patch.models.backbones import ResNetV1e - from modelscope.models.cv.face_detection.mmdet_patch.models.dense_heads import SCRFDHead - from modelscope.models.cv.face_detection.mmdet_patch.models.detectors import SCRFD - cfg = Config.fromfile(osp.join(model, 'mmcv_scrfd_10g_bnkps.py')) - detector = build_detector( - cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) - ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE) - logger.info(f'loading model from {ckpt_path}') - device = torch.device( - f'cuda:{0}' if torch.cuda.is_available() else 'cpu') - load_checkpoint(detector, ckpt_path, map_location=device) - detector = MMDataParallel(detector, device_ids=[0]) - detector.eval() + detector = ScrfdDetect(model_dir=model, **kwargs) self.detector = detector - logger.info('load model done') def preprocess(self, input: Input) -> Dict[str, Any]: img = LoadImage.convert_to_ndarray(input) @@ -85,22 +67,7 @@ class FaceDetectionPipeline(Pipeline): return result def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - - result = self.detector( - return_loss=False, - rescale=True, - img=[input['img'][0].unsqueeze(0)], - img_metas=[[dict(input['img_metas'][0].data)]]) - assert result is not None - result = result[0][0] - bboxes = result[:, :4].tolist() - kpss = result[:, 5:].tolist() - scores = result[:, 4].tolist() - return { - OutputKeys.SCORES: scores, - OutputKeys.BOXES: bboxes, - OutputKeys.KEYPOINTS: kpss - } + return self.detector(input) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/cv/face_recognition_pipeline.py b/modelscope/pipelines/cv/face_recognition_pipeline.py index 873e4a1f..abae69d4 100644 --- a/modelscope/pipelines/cv/face_recognition_pipeline.py +++ b/modelscope/pipelines/cv/face_recognition_pipeline.py @@ -49,7 +49,7 @@ class FaceRecognitionPipeline(Pipeline): # face detect pipeline det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' self.face_detection = pipeline( - Tasks.face_detection, model=det_model_id) + Tasks.face_detection, model=det_model_id, model_revision='v2') def _choose_face(self, det_result, diff --git a/modelscope/trainers/cv/card_detection_scrfd_trainer.py b/modelscope/trainers/cv/card_detection_scrfd_trainer.py new file mode 100644 index 00000000..e1f81bcf --- /dev/null +++ b/modelscope/trainers/cv/card_detection_scrfd_trainer.py @@ -0,0 +1,18 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.metainfo import Trainers +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.cv.face_detection_scrfd_trainer import \ + FaceDetectionScrfdTrainer + + +@TRAINERS.register_module(module_name=Trainers.card_detection_scrfd) +class CardDetectionScrfdTrainer(FaceDetectionScrfdTrainer): + + def __init__(self, cfg_file: str, *args, **kwargs): + """ High-level finetune api for SCRFD. + + Args: + cfg_file: Path to configuration file. 
+ """ + # card/face dataset use different img folder names + super().__init__(cfg_file, imgdir_name='', **kwargs) diff --git a/modelscope/trainers/cv/face_detection_scrfd_trainer.py b/modelscope/trainers/cv/face_detection_scrfd_trainer.py new file mode 100644 index 00000000..9cfae7dd --- /dev/null +++ b/modelscope/trainers/cv/face_detection_scrfd_trainer.py @@ -0,0 +1,154 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import copy +import os +import os.path as osp +import time +from typing import Callable, Dict, Optional + +from modelscope.metainfo import Trainers +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS + + +@TRAINERS.register_module(module_name=Trainers.face_detection_scrfd) +class FaceDetectionScrfdTrainer(BaseTrainer): + + def __init__(self, + cfg_file: str, + cfg_modify_fn: Optional[Callable] = None, + *args, + **kwargs): + """ High-level finetune api for SCRFD. + + Args: + cfg_file: Path to configuration file. + cfg_modify_fn: An input fn which is used to modify the cfg read out of the file. + """ + import mmcv + from mmcv.runner import get_dist_info, init_dist + from mmcv.utils import get_git_hash + from mmdet.utils import collect_env, get_root_logger + from mmdet.apis import set_random_seed + from mmdet.models import build_detector + from mmdet.datasets import build_dataset + from mmdet import __version__ + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import DefaultFormatBundleV2 + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import LoadAnnotationsV2 + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RotateV2 + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD + super().__init__(cfg_file) + cfg = self.cfg + if 'work_dir' in kwargs: + cfg.work_dir = kwargs['work_dir'] + else: + # use config filename as default work_dir if work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(cfg_file))[0]) + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + + if 'resume_from' in kwargs: # pretrain model for finetune + cfg.resume_from = kwargs['resume_from'] + cfg.device = 'cuda' + if 'gpu_ids' in kwargs: + cfg.gpu_ids = kwargs['gpu_ids'] + else: + cfg.gpu_ids = range(1) + labelfile_name = kwargs.pop('labelfile_name', 'labelv2.txt') + imgdir_name = kwargs.pop('imgdir_name', 'images/') + if 'train_root' in kwargs: + cfg.data.train.ann_file = kwargs['train_root'] + labelfile_name + cfg.data.train.img_prefix = kwargs['train_root'] + imgdir_name + if 'val_root' in kwargs: + cfg.data.val.ann_file = kwargs['val_root'] + labelfile_name + cfg.data.val.img_prefix = kwargs['val_root'] + imgdir_name + if 'total_epochs' in kwargs: + cfg.total_epochs = kwargs['total_epochs'] + if cfg_modify_fn is not None: + cfg = cfg_modify_fn(cfg) + if 'launcher' in kwargs: + distributed = True + init_dist(kwargs['launcher'], **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + else: + distributed = False + # no_validate=True 
will not evaluate checkpoint during training + cfg.no_validate = kwargs.get('no_validate', False) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if 'seed' in kwargs: + cfg.seed = kwargs['seed'] + _deterministic = kwargs.get('deterministic', False) + logger.info(f'Set random seed to {kwargs["seed"]}, ' + f'deterministic: {_deterministic}') + set_random_seed(kwargs['seed'], deterministic=_deterministic) + else: + cfg.seed = None + meta['seed'] = cfg.seed + meta['exp_name'] = osp.basename(cfg_file) + + model = build_detector(cfg.model) + model.init_weights() + datasets = [build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + val_dataset.pipeline = cfg.data.train.pipeline + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=__version__ + get_git_hash()[:7], + CLASSES=datasets[0].CLASSES) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + + self.cfg = cfg + self.datasets = datasets + self.model = model + self.distributed = distributed + self.timestamp = timestamp + self.meta = meta + self.logger = logger + + def train(self, *args, **kwargs): + from mmdet.apis import train_detector + train_detector( + self.model, + self.datasets, + self.cfg, + distributed=self.distributed, + validate=(not self.cfg.no_validate), + timestamp=self.timestamp, + meta=self.meta) + + def evaluate(self, + checkpoint_path: str = None, + *args, + **kwargs) -> Dict[str, float]: + cfg = self.cfg.evaluation + logger.info(f'eval cfg {cfg}') + logger.info(f'checkpoint_path {checkpoint_path}') diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 4fa3d766..5f0532ce 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -19,6 +19,7 @@ class CVTasks(object): # human face body related animal_recognition = 'animal-recognition' face_detection = 'face-detection' + card_detection = 'card-detection' face_recognition = 'face-recognition' facial_expression_recognition = 'facial-expression-recognition' face_2d_keypoints = 'face-2d-keypoints' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 06a9bbaa..2d420892 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -154,6 +154,54 @@ def draw_face_detection_result(img_path, detection_result): return img +def draw_card_detection_result(img_path, detection_result): + + def warp_img(src_img, kps, ratio): + short_size = 500 + if ratio > 1: + obj_h = short_size + obj_w = int(obj_h * ratio) + else: + obj_w = short_size + obj_h = int(obj_w / ratio) + 
input_pts = np.float32([kps[0], kps[1], kps[2], kps[3]]) + output_pts = np.float32([[0, obj_h - 1], [0, 0], [obj_w - 1, 0], + [obj_w - 1, obj_h - 1]]) + M = cv2.getPerspectiveTransform(input_pts, output_pts) + obj_img = cv2.warpPerspective(src_img, M, (obj_w, obj_h)) + return obj_img + + bboxes = np.array(detection_result[OutputKeys.BOXES]) + kpss = np.array(detection_result[OutputKeys.KEYPOINTS]) + scores = np.array(detection_result[OutputKeys.SCORES]) + img_list = [] + ver_col = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (0, 255, 255)] + img = cv2.imread(img_path) + img_list += [img] + assert img is not None, f"Can't read img: {img_path}" + for i in range(len(scores)): + bbox = bboxes[i].astype(np.int32) + kps = kpss[i].reshape(-1, 2).astype(np.int32) + _w = (kps[0][0] - kps[3][0])**2 + (kps[0][1] - kps[3][1])**2 + _h = (kps[0][0] - kps[1][0])**2 + (kps[0][1] - kps[1][1])**2 + ratio = 1.59 if _w >= _h else 1 / 1.59 + card_img = warp_img(img, kps, ratio) + img_list += [card_img] + score = scores[i] + x1, y1, x2, y2 = bbox + cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 4) + for k, kp in enumerate(kps): + cv2.circle(img, tuple(kp), 1, color=ver_col[k], thickness=10) + cv2.putText( + img, + f'{score:.2f}', (x1, y2), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + return img_list + + def created_boxed_image(image_in, box): image = load_image(image_in) img = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) diff --git a/tests/pipelines/test_card_detection.py b/tests/pipelines/test_card_detection.py new file mode 100644 index 00000000..d913f494 --- /dev/null +++ b/tests/pipelines/test_card_detection.py @@ -0,0 +1,66 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +import unittest + +import cv2 + +from modelscope.msdatasets import MsDataset +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_card_detection_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class CardDetectionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.card_detection + self.model_id = 'damo/cv_resnet_carddetection_scrfd34gkps' + + def show_result(self, img_path, detection_result): + img_list = draw_card_detection_result(img_path, detection_result) + for i, img in enumerate(img_list): + if i == 0: + cv2.imwrite('result.jpg', img_list[0]) + print( + f'Found {len(img_list)-1} cards, output written to {osp.abspath("result.jpg")}' + ) + else: + cv2.imwrite(f'card_{i}.jpg', img_list[i]) + save_path = osp.abspath(f'card_{i}.jpg') + print(f'detect card_{i}: {save_path}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_dataset(self): + input_location = ['data/test/images/card_detection.jpg'] + + dataset = MsDataset.load(input_location, target='image') + card_detection = pipeline(Tasks.card_detection, model=self.model_id) + # note that for dataset output, the inference-output is a Generator that can be iterated. 
+ result = card_detection(dataset) + result = next(result) + self.show_result(input_location[0], result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + card_detection = pipeline(Tasks.card_detection, model=self.model_id) + img_path = 'data/test/images/card_detection.jpg' + + result = card_detection(img_path) + self.show_result(img_path, result) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + card_detection = pipeline(Tasks.card_detection) + img_path = 'data/test/images/card_detection.jpg' + result = card_detection(img_path) + self.show_result(img_path, result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_face_detection.py b/tests/pipelines/test_face_detection.py index f89e9a94..31ae403e 100644 --- a/tests/pipelines/test_face_detection.py +++ b/tests/pipelines/test_face_detection.py @@ -25,10 +25,11 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_dataset(self): - input_location = ['data/test/images/face_detection.png'] + input_location = ['data/test/images/face_detection2.jpeg'] dataset = MsDataset.load(input_location, target='image') - face_detection = pipeline(Tasks.face_detection, model=self.model_id) + face_detection = pipeline( + Tasks.face_detection, model=self.model_id, model_revision='v2') # note that for dataset output, the inference-output is a Generator that can be iterated. result = face_detection(dataset) result = next(result) @@ -36,8 +37,9 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): - face_detection = pipeline(Tasks.face_detection, model=self.model_id) - img_path = 'data/test/images/face_detection.png' + face_detection = pipeline( + Tasks.face_detection, model=self.model_id, model_revision='v2') + img_path = 'data/test/images/face_detection2.jpeg' result = face_detection(img_path) self.show_result(img_path, result) @@ -45,7 +47,7 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): face_detection = pipeline(Tasks.face_detection) - img_path = 'data/test/images/face_detection.png' + img_path = 'data/test/images/face_detection2.jpeg' result = face_detection(img_path) self.show_result(img_path, result) diff --git a/tests/trainers/test_card_detection_scrfd_trainer.py b/tests/trainers/test_card_detection_scrfd_trainer.py new file mode 100644 index 00000000..af87000b --- /dev/null +++ b/tests/trainers/test_card_detection_scrfd_trainer.py @@ -0,0 +1,151 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import DistributedTestCase, test_level + + +def _setup(): + model_id = 'damo/cv_resnet_carddetection_scrfd34gkps' + # mini dataset only for unit test, remove '_mini' for full dataset. + ms_ds_syncards = MsDataset.load( + 'SyntheticCards_mini', namespace='shaoxuan') + + data_path = ms_ds_syncards.config_kwargs['split_config'] + train_dir = data_path['train'] + val_dir = data_path['validation'] + train_root = train_dir + '/' + os.listdir(train_dir)[0] + '/' + val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/' + max_epochs = 1 # run epochs in unit test + + cache_path = snapshot_download(model_id) + + tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + return train_root, val_root, max_epochs, cache_path, tmp_dir + + +def train_func(**kwargs): + trainer = build_trainer( + name=Trainers.card_detection_scrfd, default_args=kwargs) + trainer.train() + + +class TestCardDetectionScrfdTrainerSingleGPU(unittest.TestCase): + + def setUp(self): + print(('SingleGPU Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def _cfg_modify_fn(self, cfg): + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 # batch size + return cfg + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_from_scratch(self): + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.card_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.card_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +@unittest.skipIf(not torch.cuda.is_available() + or torch.cuda.device_count() <= 1, 'distributed unittest') +class TestCardDetectionScrfdTrainerMultiGpus(DistributedTestCase): + + def setUp(self): + print(('MultiGPUs 
Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + cfg_file_path = os.path.join(self.cache_path, 'mmcv_scrfd.py') + cfg = Config.from_file(cfg_file_path) + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 + cfg.dump(cfg_file_path) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_multi_gpus_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + launcher='pytorch') + self.start(train_func, num_gpus=2, **kwargs) + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_face_detection_scrfd_trainer.py b/tests/trainers/test_face_detection_scrfd_trainer.py new file mode 100644 index 00000000..eb9440ef --- /dev/null +++ b/tests/trainers/test_face_detection_scrfd_trainer.py @@ -0,0 +1,150 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import DistributedTestCase, test_level + + +def _setup(): + model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + # mini dataset only for unit test, remove '_mini' for full dataset. 
+ ms_ds_widerface = MsDataset.load('WIDER_FACE_mini', namespace='shaoxuan') + + data_path = ms_ds_widerface.config_kwargs['split_config'] + train_dir = data_path['train'] + val_dir = data_path['validation'] + train_root = train_dir + '/' + os.listdir(train_dir)[0] + '/' + val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/' + max_epochs = 1 # run epochs in unit test + + cache_path = snapshot_download(model_id, revision='v2') + + tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + return train_root, val_root, max_epochs, cache_path, tmp_dir + + +def train_func(**kwargs): + trainer = build_trainer( + name=Trainers.face_detection_scrfd, default_args=kwargs) + trainer.train() + + +class TestFaceDetectionScrfdTrainerSingleGPU(unittest.TestCase): + + def setUp(self): + print(('SingleGPU Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def _cfg_modify_fn(self, cfg): + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 # batch size + return cfg + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_from_scratch(self): + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.face_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.face_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +@unittest.skipIf(not torch.cuda.is_available() + or torch.cuda.device_count() <= 1, 'distributed unittest') +class TestFaceDetectionScrfdTrainerMultiGpus(DistributedTestCase): + + def setUp(self): + print(('MultiGPUs Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + cfg_file_path = os.path.join(self.cache_path, 'mmcv_scrfd.py') + cfg = Config.from_file(cfg_file_path) + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 + cfg.dump(cfg_file_path) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current 
test level') + def test_multi_gpus_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + launcher='pytorch') + self.start(train_func, num_gpus=2, **kwargs) + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +if __name__ == '__main__': + unittest.main() From 3863efc14d6da1786a93e7652d949d8d55ae8624 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Thu, 13 Oct 2022 10:15:33 +0800 Subject: [PATCH 25/57] [to #42322933] add far field KWS trainer Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10275823 --- data/test/audios/noise_2ch.wav | 3 + .../test/audios/wake_word_with_label_xyxy.wav | 3 + modelscope/metainfo.py | 1 + modelscope/models/audio/kws/farfield/model.py | 63 ++-- .../task_datasets/audio/__init__.py | 21 ++ .../audio/kws_farfield_dataset.py | 280 ++++++++++++++++++ .../trainers/audio/kws_farfield_trainer.py | 279 +++++++++++++++++ modelscope/utils/audio/audio_utils.py | 18 ++ requirements/audio.txt | 6 +- .../audio/test_kws_farfield_trainer.py | 85 ++++++ 10 files changed, 721 insertions(+), 38 deletions(-) create mode 100644 data/test/audios/noise_2ch.wav create mode 100644 data/test/audios/wake_word_with_label_xyxy.wav create mode 100644 modelscope/msdatasets/task_datasets/audio/__init__.py create mode 100644 modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py create mode 100644 modelscope/trainers/audio/kws_farfield_trainer.py create mode 100644 tests/trainers/audio/test_kws_farfield_trainer.py diff --git a/data/test/audios/noise_2ch.wav b/data/test/audios/noise_2ch.wav new file mode 100644 index 00000000..c754e39a --- /dev/null +++ b/data/test/audios/noise_2ch.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8d653a9a1ee49789c3df38e8da96af7118e0d8336d6ed12cd6458efa015071d +size 2327764 diff --git a/data/test/audios/wake_word_with_label_xyxy.wav b/data/test/audios/wake_word_with_label_xyxy.wav new file mode 100644 index 00000000..b7999777 --- /dev/null +++ b/data/test/audios/wake_word_with_label_xyxy.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c589d77404ea17d4d24daeb8624dce7e1ac919dc75e6bed44ea9d116f0514150 +size 68524 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 0917bf3e..46c3b138 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -285,6 +285,7 @@ class Trainers(object): # audio trainers speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' class Preprocessors(object): diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py index fea82194..d63d1e2a 100644 --- a/modelscope/models/audio/kws/farfield/model.py +++ b/modelscope/models/audio/kws/farfield/model.py @@ -1,15 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os -from typing import Dict - -import torch +from typing import Dict, Optional from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.audio.audio_utils import update_conf +from modelscope.utils.constant import Tasks from .fsmn_sele_v2 import FSMNSeleNetV2 @@ -20,48 +19,38 @@ class FSMNSeleNetV2Decorator(TorchModel): MODEL_TXT = 'model.txt' SC_CONFIG = 'sound_connect.conf' - SC_CONF_ITEM_KWS_MODEL = '${kws_model}' - def __init__(self, model_dir: str, *args, **kwargs): + def __init__(self, + model_dir: str, + training: Optional[bool] = False, + *args, + **kwargs): """initialize the dfsmn model from the `model_dir` path. Args: model_dir (str): the model path. """ super().__init__(model_dir, *args, **kwargs) - sc_config_file = os.path.join(model_dir, self.SC_CONFIG) - model_txt_file = os.path.join(model_dir, self.MODEL_TXT) - model_bin_file = os.path.join(model_dir, - ModelFile.TORCH_MODEL_BIN_FILE) - self._model = None - if os.path.exists(model_bin_file): - kwargs.pop('device') - self._model = FSMNSeleNetV2(*args, **kwargs) - checkpoint = torch.load(model_bin_file) - self._model.load_state_dict(checkpoint, strict=False) - - self._sc = None - if os.path.exists(model_txt_file): - with open(sc_config_file) as f: - lines = f.readlines() - with open(sc_config_file, 'w') as f: - for line in lines: - if self.SC_CONF_ITEM_KWS_MODEL in line: - line = line.replace(self.SC_CONF_ITEM_KWS_MODEL, - model_txt_file) - f.write(line) - import py_sound_connect - self._sc = py_sound_connect.SoundConnect(sc_config_file) - self.size_in = self._sc.bytesPerBlockIn() - self.size_out = self._sc.bytesPerBlockOut() - - if self._model is None and self._sc is None: - raise Exception( - f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.' - ) + if training: + self.model = FSMNSeleNetV2(*args, **kwargs) + else: + sc_config_file = os.path.join(model_dir, self.SC_CONFIG) + model_txt_file = os.path.join(model_dir, self.MODEL_TXT) + self._sc = None + if os.path.exists(model_txt_file): + conf_dict = dict(mode=56542, kws_model=model_txt_file) + update_conf(sc_config_file, sc_config_file, conf_dict) + import py_sound_connect + self._sc = py_sound_connect.SoundConnect(sc_config_file) + self.size_in = self._sc.bytesPerBlockIn() + self.size_out = self._sc.bytesPerBlockOut() + else: + raise Exception( + f'Invalid model directory! Failed to load model file: {model_txt_file}.' + ) def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - ... + return self.model.forward(input) def forward_decode(self, data: bytes): result = {'pcm': self._sc.process(data, self.size_out)} diff --git a/modelscope/msdatasets/task_datasets/audio/__init__.py b/modelscope/msdatasets/task_datasets/audio/__init__.py new file mode 100644 index 00000000..c62a8d9c --- /dev/null +++ b/modelscope/msdatasets/task_datasets/audio/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
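# A hedged sketch of what update_conf(...) above is used for: filling ${placeholder}
# entries in a sound_connect config template with concrete values (it replaces the old
# hand-rolled '${kws_model}' string substitution). The real helper lives in
# modelscope.utils.audio.audio_utils; this standalone version only illustrates the idea
# and is an assumption, not the actual implementation.
def update_conf_sketch(template_path, out_path, conf_dict):
    with open(template_path) as f:
        content = f.read()
    for key, value in conf_dict.items():
        content = content.replace('${' + key + '}', str(value))
    with open(out_path, 'w') as f:
        f.write(content)

# e.g. update_conf_sketch(sc_config_file, sc_config_file,
#                         dict(mode=56542, kws_model=model_txt_file))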
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .kws_farfield_dataset import KWSDataset, KWSDataLoader + +else: + _import_structure = { + 'kws_farfield_dataset': ['KWSDataset', 'KWSDataLoader'], + } + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py b/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py new file mode 100644 index 00000000..8c518ec9 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py @@ -0,0 +1,280 @@ +""" +Used to prepare simulated data. +""" +import math +import os.path +import queue +import threading +import time + +import numpy as np +import torch + +from modelscope.utils.logger import get_logger + +logger = get_logger() + +BLOCK_DEC = 2 +BLOCK_CAT = 3 +FBANK_SIZE = 40 +LABEL_SIZE = 1 +LABEL_GAIN = 100.0 + + +class KWSDataset: + """ + dataset for keyword spotting and vad + conf_basetrain: basetrain configure file path + conf_finetune: finetune configure file path, null allowed + numworkers: no. of workers + basetrainratio: basetrain workers ratio + numclasses: no. of nn output classes, 2 classes to generate vad label + blockdec: block decimation + blockcat: block concatenation + """ + + def __init__(self, + conf_basetrain, + conf_finetune, + numworkers, + basetrainratio, + numclasses, + blockdec=BLOCK_CAT, + blockcat=BLOCK_CAT): + super().__init__() + self.numclasses = numclasses + self.blockdec = blockdec + self.blockcat = blockcat + self.sims_base = [] + self.sims_senior = [] + self.setup_sims(conf_basetrain, conf_finetune, numworkers, + basetrainratio) + + def release(self): + for sim in self.sims_base: + del sim + for sim in self.sims_senior: + del sim + del self.base_conf + del self.senior_conf + logger.info('KWSDataset: Released.') + + def setup_sims(self, conf_basetrain, conf_finetune, numworkers, + basetrainratio): + if not os.path.exists(conf_basetrain): + raise ValueError(f'{conf_basetrain} does not exist!') + if not os.path.exists(conf_finetune): + raise ValueError(f'{conf_finetune} does not exist!') + import py_sound_connect + logger.info('KWSDataset init SoundConnect...') + num_base = math.ceil(numworkers * basetrainratio) + num_senior = numworkers - num_base + # hold by fields to avoid python releasing conf object + self.base_conf = py_sound_connect.ConfigFile(conf_basetrain) + self.senior_conf = py_sound_connect.ConfigFile(conf_finetune) + for i in range(num_base): + fs = py_sound_connect.FeatSimuKWS(self.base_conf.params) + self.sims_base.append(fs) + for i in range(num_senior): + self.sims_senior.append( + py_sound_connect.FeatSimuKWS(self.senior_conf.params)) + logger.info('KWSDataset init SoundConnect finished.') + + def getBatch(self, id): + """ + Generate a data batch + + Args: + id: worker id + + Return: time x channel x feature, label + """ + fs = self.get_sim(id) + fs.processBatch() + # get multi-channel feature vector size + featsize = fs.featSize() + # get label vector size + labelsize = fs.labelSize() + # get minibatch size (time dimension) + # batchsize = fs.featBatchSize() + # no. 
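# Quick check of the worker split in setup_sims above and the get_sim(id) lookup that
# follows: the first ceil(numworkers * basetrainratio) worker ids are served by basetrain
# simulators, the remaining ids by finetune ("senior") simulators. Numbers are illustrative.
import math

numworkers, basetrainratio = 5, 0.5
num_base = math.ceil(numworkers * basetrainratio)     # 3 basetrain workers (ids 0..2)
num_senior = numworkers - num_base                    # 2 finetune workers (ids 3..4)
which = ['base' if i < num_base else 'senior' for i in range(numworkers)]
# which == ['base', 'base', 'base', 'senior', 'senior']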
of fe output channels + numchs = featsize // FBANK_SIZE + # get raw data + fs_feat = fs.feat() + data = np.frombuffer(fs_feat, dtype='float32') + data = data.reshape((-1, featsize + labelsize)) + + # convert float label to int + label = data[:, FBANK_SIZE * numchs:] + + if self.numclasses == 2: + # generate vad label + label[label > 0.0] = 1.0 + else: + # generate kws label + label = np.round(label * LABEL_GAIN) + label[label > self.numclasses - 1] = 0.0 + + # decimated size + size1 = int(np.ceil( + label.shape[0] / self.blockdec)) - self.blockcat + 1 + + # label decimation + label1 = np.zeros((size1, LABEL_SIZE), dtype='float32') + for tau in range(size1): + label1[tau, :] = label[(tau + self.blockcat // 2) + * self.blockdec, :] + + # feature decimation and concatenation + # time x channel x feature + featall = np.zeros((size1, numchs, FBANK_SIZE * self.blockcat), + dtype='float32') + for n in range(numchs): + feat = data[:, FBANK_SIZE * n:FBANK_SIZE * (n + 1)] + + for tau in range(size1): + for i in range(self.blockcat): + featall[tau, n, FBANK_SIZE * i:FBANK_SIZE * (i + 1)] = \ + feat[(tau + i) * self.blockdec, :] + + return torch.from_numpy(featall), torch.from_numpy(label1).long() + + def get_sim(self, id): + num_base = len(self.sims_base) + if id < num_base: + fs = self.sims_base[id] + else: + fs = self.sims_senior[id - num_base] + return fs + + +class Worker(threading.Thread): + """ + id: worker id + dataset: the dataset + pool: queue as the global data buffer + """ + + def __init__(self, id, dataset, pool): + threading.Thread.__init__(self) + + self.id = id + self.dataset = dataset + self.pool = pool + self.isrun = True + self.nn = 0 + + def run(self): + while self.isrun: + self.nn += 1 + logger.debug(f'Worker {self.id:02d} running {self.nn:05d}:1') + # get simulated minibatch + if self.isrun: + data = self.dataset.getBatch(self.id) + logger.debug(f'Worker {self.id:02d} running {self.nn:05d}:2') + + # put data into buffer + if self.isrun: + self.pool.put(data) + logger.debug(f'Worker {self.id:02d} running {self.nn:05d}:3') + + logger.info('KWSDataLoader: Worker {:02d} stopped.'.format(self.id)) + + def stopWorker(self): + """ + stop the worker thread + """ + self.isrun = False + + +class KWSDataLoader: + """ + dataset: the dataset reference + batchsize: data batch size + numworkers: no. 
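# Toy, single-channel walk-through of the two steps in getBatch above: (1) turn the float
# labels from the simulator into class ids, (2) decimate frames by `blockdec` and stack
# `blockcat` consecutive kept frames into one training frame. FBANK_SIZE is shrunk to 4
# and all values are made up so the shapes stay readable.
import numpy as np

FBANK, LABEL_GAIN, numclasses = 4, 100.0, 3
blockdec, blockcat = 2, 3

feat = np.arange(20 * FBANK, dtype='float32').reshape(20, FBANK)   # 20 frames, 1 channel
label = np.zeros((20, 1), dtype='float32')
label[8:12] = 0.01                                                  # keyword class 1

# (1) float label -> keyword class id (binarise instead when numclasses == 2)
label = np.round(label * LABEL_GAIN)
label[label > numclasses - 1] = 0.0

# (2) block decimation + concatenation, mirroring getBatch for a single channel
size1 = int(np.ceil(label.shape[0] / blockdec)) - blockcat + 1
label1 = np.zeros((size1, 1), dtype='float32')
featall = np.zeros((size1, 1, FBANK * blockcat), dtype='float32')
for tau in range(size1):
    label1[tau, :] = label[(tau + blockcat // 2) * blockdec, :]
    for i in range(blockcat):
        featall[tau, 0, FBANK * i:FBANK * (i + 1)] = feat[(tau + i) * blockdec, :]
# featall.shape == (8, 1, 12), label1.shape == (8, 1)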
of workers + prefetch: prefetch factor + """ + + def __init__(self, dataset, batchsize, numworkers, prefetch=2): + self.dataset = dataset + self.batchsize = batchsize + self.datamap = {} + self.isrun = True + + # data queue + self.pool = queue.Queue(batchsize * prefetch) + + # initialize workers + self.workerlist = [] + for id in range(numworkers): + w = Worker(id, dataset, self.pool) + self.workerlist.append(w) + + def __iter__(self): + return self + + def __next__(self): + while self.isrun: + # get data from common data pool + data = self.pool.get() + self.pool.task_done() + + # group minibatches with the same shape + key = str(data[0].shape) + + batchl = self.datamap.get(key) + if batchl is None: + batchl = [] + self.datamap.update({key: batchl}) + + batchl.append(data) + + # a full data batch collected + if len(batchl) >= self.batchsize: + featbatch = [] + labelbatch = [] + + for feat, label in batchl: + featbatch.append(feat) + labelbatch.append(label) + + batchl.clear() + + feattensor = torch.stack(featbatch, dim=0) + labeltensor = torch.stack(labelbatch, dim=0) + + if feattensor.shape[-2] == 1: + logger.debug('KWSDataLoader: Basetrain batch.') + else: + logger.debug('KWSDataLoader: Finetune batch.') + + return feattensor, labeltensor + + return None, None + + def start(self): + """ + start multi-thread data loader + """ + for w in self.workerlist: + w.start() + + def stop(self): + """ + stop data loader + """ + logger.info('KWSDataLoader: Stopping...') + self.isrun = False + + for w in self.workerlist: + w.stopWorker() + + while not self.pool.empty(): + self.pool.get(block=True, timeout=0.001) + + # wait workers terminated + for w in self.workerlist: + while not self.pool.empty(): + self.pool.get(block=True, timeout=0.001) + w.join() + logger.info('KWSDataLoader: All worker stopped.') diff --git a/modelscope/trainers/audio/kws_farfield_trainer.py b/modelscope/trainers/audio/kws_farfield_trainer.py new file mode 100644 index 00000000..a720ced5 --- /dev/null +++ b/modelscope/trainers/audio/kws_farfield_trainer.py @@ -0,0 +1,279 @@ +import datetime +import math +import os +from typing import Callable, Dict, Optional + +import numpy as np +import torch +from torch import nn as nn +from torch import optim as optim + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.models import Model, TorchModel +from modelscope.msdatasets.task_datasets.audio import KWSDataLoader, KWSDataset +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.audio.audio_utils import update_conf +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.data_utils import to_device +from modelscope.utils.device import create_device +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, + init_dist, is_master) + +logger = get_logger() + +BASETRAIN_CONF_EASY = 'basetrain_easy' +BASETRAIN_CONF_NORMAL = 'basetrain_normal' +BASETRAIN_CONF_HARD = 'basetrain_hard' +FINETUNE_CONF_EASY = 'finetune_easy' +FINETUNE_CONF_NORMAL = 'finetune_normal' +FINETUNE_CONF_HARD = 'finetune_hard' + +EASY_RATIO = 0.1 +NORMAL_RATIO = 0.6 +HARD_RATIO = 0.3 +BASETRAIN_RATIO = 0.5 + + +@TRAINERS.register_module(module_name=Trainers.speech_dfsmn_kws_char_farfield) +class KWSFarfieldTrainer(BaseTrainer): + DEFAULT_WORK_DIR = './work_dir' + conf_keys = (BASETRAIN_CONF_EASY, FINETUNE_CONF_EASY, + 
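# Minimal stand-alone version of the batching trick in KWSDataLoader.__next__ above: items
# coming out of the worker pool are grouped by tensor shape (basetrain batches are
# single-channel, finetune batches multi-channel), and a batch is emitted only once
# `batchsize` items with the same shape have been collected.
import torch

def group_and_stack(stream, batchsize):
    datamap = {}
    for feat, label in stream:
        batchl = datamap.setdefault(str(feat.shape), [])
        batchl.append((feat, label))
        if len(batchl) >= batchsize:
            feats, labels = zip(*batchl)
            batchl.clear()
            yield torch.stack(feats, dim=0), torch.stack(labels, dim=0)

# example: mixed 1-channel and 3-channel items, batchsize 2 -> one batch per shape
items = [(torch.zeros(8, 1, 120), torch.zeros(8, dtype=torch.long)) for _ in range(2)] + \
        [(torch.zeros(8, 3, 120), torch.zeros(8, dtype=torch.long)) for _ in range(2)]
batches = list(group_and_stack(iter(items), batchsize=2))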
BASETRAIN_CONF_NORMAL, FINETUNE_CONF_NORMAL, + BASETRAIN_CONF_HARD, FINETUNE_CONF_HARD) + + def __init__(self, + model: str, + work_dir: str, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + custom_conf: Optional[dict] = None, + **kwargs): + + if isinstance(model, str): + if os.path.exists(model): + self.model_dir = model if os.path.isdir( + model) else os.path.dirname(model) + else: + self.model_dir = snapshot_download( + model, revision=model_revision) + if cfg_file is None: + cfg_file = os.path.join(self.model_dir, + ModelFile.CONFIGURATION) + else: + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' + self.model_dir = os.path.dirname(cfg_file) + + super().__init__(cfg_file, arg_parse_fn) + + self.model = self.build_model() + self.work_dir = work_dir + # the number of model output dimension + # should update config outside the trainer, if user need more wake word + self._num_classes = self.cfg.model.num_syn + + if kwargs.get('launcher', None) is not None: + init_dist(kwargs['launcher']) + + _, world_size = get_dist_info() + self._dist = world_size > 1 + + device_name = kwargs.get('device', 'gpu') + if self._dist: + local_rank = get_local_rank() + device_name = f'cuda:{local_rank}' + + self.device = create_device(device_name) + # model placement + if self.device.type == 'cuda': + self.model.to(self.device) + + if 'max_epochs' not in kwargs: + assert hasattr( + self.cfg.train, 'max_epochs' + ), 'max_epochs is missing from the configuration file' + self._max_epochs = self.cfg.train.max_epochs + else: + self._max_epochs = kwargs['max_epochs'] + self._train_iters = kwargs.get('train_iters_per_epoch', None) + self._val_iters = kwargs.get('val_iters_per_epoch', None) + if self._train_iters is None: + self._train_iters = self.cfg.train.train_iters_per_epoch + if self._val_iters is None: + self._val_iters = self.cfg.evaluation.val_iters_per_epoch + dataloader_config = self.cfg.train.dataloader + self._threads = kwargs.get('workers', None) + if self._threads is None: + self._threads = dataloader_config.workers_per_gpu + self._single_rate = BASETRAIN_RATIO + if 'single_rate' in kwargs: + self._single_rate = kwargs['single_rate'] + self._batch_size = dataloader_config.batch_size_per_gpu + if 'model_bin' in kwargs: + model_bin_file = os.path.join(self.model_dir, kwargs['model_bin']) + checkpoint = torch.load(model_bin_file) + self.model.load_state_dict(checkpoint) + # build corresponding optimizer and loss function + lr = self.cfg.train.optimizer.lr + self.optimizer = optim.Adam(self.model.parameters(), lr) + self.loss_fn = nn.CrossEntropyLoss() + self.data_val = None + self.json_log_path = os.path.join(self.work_dir, + '{}.log.json'.format(self.timestamp)) + self.conf_files = [] + for conf_key in self.conf_keys: + template_file = os.path.join(self.model_dir, conf_key) + conf_file = os.path.join(self.model_dir, f'{conf_key}.conf') + update_conf(template_file, conf_file, custom_conf[conf_key]) + self.conf_files.append(conf_file) + self._current_epoch = 0 + self.stages = (math.floor(self._max_epochs * EASY_RATIO), + math.floor(self._max_epochs * NORMAL_RATIO), + math.floor(self._max_epochs * HARD_RATIO)) + + def build_model(self) -> nn.Module: + """ Instantiate a pytorch model and return. + + By default, we will create a model using config from configuration file. You can + override this method in a subclass. 
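# The curriculum above splits max_epochs across easy/normal/hard stages by fixed ratios.
# A quick check of the split for, say, max_epochs = 10:
import math

EASY_RATIO, NORMAL_RATIO, HARD_RATIO = 0.1, 0.6, 0.3
max_epochs = 10
stages = (math.floor(max_epochs * EASY_RATIO),
          math.floor(max_epochs * NORMAL_RATIO),
          math.floor(max_epochs * HARD_RATIO))
# stages == (1, 6, 3); run_stage() below skips (with a warning) any stage that gets
# 0 epochs, which can happen for small max_epochs because of the floor.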
+ + """ + model = Model.from_pretrained( + self.model_dir, cfg_dict=self.cfg, training=True) + if isinstance(model, TorchModel) and hasattr(model, 'model'): + return model.model + elif isinstance(model, nn.Module): + return model + + def train(self, *args, **kwargs): + if not self.data_val: + self.gen_val() + logger.info('Start training...') + totaltime = datetime.datetime.now() + + for stage, num_epoch in enumerate(self.stages): + self.run_stage(stage, num_epoch) + + # total time spent + totaltime = datetime.datetime.now() - totaltime + logger.info('Total time spent: {:.2f} hours\n'.format( + totaltime.total_seconds() / 3600.0)) + + def run_stage(self, stage, num_epoch): + """ + Run training stages with correspond data + + Args: + stage: id of stage + num_epoch: the number of epoch to run in this stage + """ + if num_epoch <= 0: + logger.warning(f'Invalid epoch number, stage {stage} exit!') + return + logger.info(f'Starting stage {stage}...') + dataset, dataloader = self.create_dataloader( + self.conf_files[stage * 2], self.conf_files[stage * 2 + 1]) + it = iter(dataloader) + for _ in range(num_epoch): + self._current_epoch += 1 + epochtime = datetime.datetime.now() + logger.info('Start epoch %d...', self._current_epoch) + loss_train_epoch = 0.0 + validbatchs = 0 + for bi in range(self._train_iters): + # prepare data + feat, label = next(it) + label = torch.reshape(label, (-1, )) + feat = to_device(feat, self.device) + label = to_device(label, self.device) + # apply model + self.optimizer.zero_grad() + predict = self.model(feat) + # calculate loss + loss = self.loss_fn( + torch.reshape(predict, (-1, self._num_classes)), label) + if not np.isnan(loss.item()): + loss.backward() + self.optimizer.step() + loss_train_epoch += loss.item() + validbatchs += 1 + train_result = 'Epoch: {:04d}/{:04d}, batch: {:04d}/{:04d}, loss: {:.4f}'.format( + self._current_epoch, self._max_epochs, bi + 1, + self._train_iters, loss.item()) + logger.info(train_result) + self._dump_log(train_result) + + # average training loss in one epoch + loss_train_epoch /= validbatchs + loss_val_epoch = self.evaluate('') + val_result = 'Evaluate epoch: {:04d}, loss_train: {:.4f}, loss_val: {:.4f}'.format( + self._current_epoch, loss_train_epoch, loss_val_epoch) + logger.info(val_result) + self._dump_log(val_result) + # check point + ckpt_name = 'checkpoint_{:04d}_loss_train_{:.4f}_loss_val_{:.4f}.pth'.format( + self._current_epoch, loss_train_epoch, loss_val_epoch) + torch.save(self.model, os.path.join(self.work_dir, ckpt_name)) + # time spent per epoch + epochtime = datetime.datetime.now() - epochtime + logger.info('Epoch {:04d} time spent: {:.2f} hours'.format( + self._current_epoch, + epochtime.total_seconds() / 3600.0)) + dataloader.stop() + dataset.release() + logger.info(f'Stage {stage} is finished.') + + def gen_val(self): + """ + generate validation set + """ + logger.info('Start generating validation set...') + dataset, dataloader = self.create_dataloader(self.conf_files[2], + self.conf_files[3]) + it = iter(dataloader) + + self.data_val = [] + for bi in range(self._val_iters): + logger.info('Iterating validation data %d', bi) + feat, label = next(it) + label = torch.reshape(label, (-1, )) + self.data_val.append([feat, label]) + + dataloader.stop() + dataset.release() + logger.info('Finish generating validation set!') + + def create_dataloader(self, base_path, finetune_path): + dataset = KWSDataset(base_path, finetune_path, self._threads, + self._single_rate, self._num_classes) + dataloader = KWSDataLoader( + dataset, 
batchsize=self._batch_size, numworkers=self._threads)
+        dataloader.start()
+        return dataset, dataloader
+
+    def evaluate(self, checkpoint_path: str, *args,
+                 **kwargs) -> Dict[str, float]:
+        logger.info('Start validation...')
+        loss_val_epoch = 0.0
+
+        with torch.no_grad():
+            for feat, label in self.data_val:
+                feat = to_device(feat, self.device)
+                label = to_device(label, self.device)
+                # apply model
+                predict = self.model(feat)
+                # calculate loss
+                loss = self.loss_fn(
+                    torch.reshape(predict, (-1, self._num_classes)), label)
+                loss_val_epoch += loss.item()
+        logger.info('Finish validation.')
+        return loss_val_epoch / self._val_iters
+
+    def _dump_log(self, msg):
+        if is_master():
+            with open(self.json_log_path, 'a+') as f:
+                f.write(msg)
+                f.write('\n')
diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py
index 4c2c45cc..647d9521 100644
--- a/modelscope/utils/audio/audio_utils.py
+++ b/modelscope/utils/audio/audio_utils.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import re
 import struct
 from typing import Union
 from urllib.parse import urlparse
@@ -37,6 +38,26 @@ def audio_norm(x):
     return x
 
 
+def update_conf(origin_config_file, new_config_file, conf_item: dict):
+    """Fill ${key} placeholders in a config template with values from conf_item."""
+
+    def repl(matched):
+        key = matched.group(1)
+        if key in conf_item:
+            return conf_item[key]
+        else:
+            # leave unknown placeholders unchanged; returning None here
+            # would make re.sub raise a TypeError
+            return matched.group(0)
+
+    with open(origin_config_file) as f:
+        lines = f.readlines()
+    with open(new_config_file, 'w') as f:
+        for line in lines:
+            line = re.sub(r'\$\{(.*)\}', repl, line)
+            f.write(line)
+
+
 def extract_pcm_from_wav(wav: bytes) -> bytes:
     data = wav
     if len(data) > 44:
diff --git a/requirements/audio.txt b/requirements/audio.txt
index d22ad8f1..742cf166 100644
--- a/requirements/audio.txt
+++ b/requirements/audio.txt
@@ -14,7 +14,11 @@ nltk
 numpy<=1.18
 # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.
protobuf>3,<3.21.0 -py_sound_connect +ptflops +py_sound_connect>=0.1 +pytorch_wavelets +PyWavelets>=1.0.0 +scikit-learn SoundFile>0.10 sox torchaudio diff --git a/tests/trainers/audio/test_kws_farfield_trainer.py b/tests/trainers/audio/test_kws_farfield_trainer.py new file mode 100644 index 00000000..2631a542 --- /dev/null +++ b/tests/trainers/audio/test_kws_farfield_trainer.py @@ -0,0 +1,85 @@ +import os +import shutil +import tempfile +import unittest + +from modelscope.metainfo import Trainers +from modelscope.trainers import build_trainer +from modelscope.utils.test_utils import test_level + +POS_FILE = 'data/test/audios/wake_word_with_label_xyxy.wav' +NEG_FILE = 'data/test/audios/speech_with_noise.wav' +NOISE_FILE = 'data/test/audios/speech_with_noise.wav' +INTERF_FILE = 'data/test/audios/speech_with_noise.wav' +REF_FILE = 'data/test/audios/farend_speech.wav' +NOISE_2CH_FILE = 'data/test/audios/noise_2ch.wav' + + +class TestKwsFarfieldTrainer(unittest.TestCase): + REVISION = 'beta' + + def setUp(self): + self.tmp_dir = tempfile.TemporaryDirectory().name + print(f'tmp dir: {self.tmp_dir}') + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + self.model_id = 'damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya' + + train_pos_list = self.create_list('pos.list', POS_FILE) + train_neg_list = self.create_list('neg.list', NEG_FILE) + train_noise1_list = self.create_list('noise.list', NOISE_FILE) + train_noise2_list = self.create_list('noise_2ch.list', NOISE_2CH_FILE) + train_interf_list = self.create_list('interf.list', INTERF_FILE) + train_ref_list = self.create_list('ref.list', REF_FILE) + + base_dict = dict( + train_pos_list=train_pos_list, + train_neg_list=train_neg_list, + train_noise1_list=train_noise1_list) + fintune_dict = dict( + train_pos_list=train_pos_list, + train_neg_list=train_neg_list, + train_noise1_list=train_noise1_list, + train_noise2_type='1', + train_noise1_ratio='0.2', + train_noise2_list=train_noise2_list, + train_interf_list=train_interf_list, + train_ref_list=train_ref_list) + self.custom_conf = dict( + basetrain_easy=base_dict, + basetrain_normal=base_dict, + basetrain_hard=base_dict, + finetune_easy=fintune_dict, + finetune_normal=fintune_dict, + finetune_hard=fintune_dict) + + def create_list(self, list_name, audio_file): + pos_list_file = os.path.join(self.tmp_dir, list_name) + with open(pos_list_file, 'w') as f: + for i in range(10): + f.write(f'{os.path.join(os.getcwd(), audio_file)}\n') + train_pos_list = f'{pos_list_file}, 1.0' + return train_pos_list + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_normal(self): + kwargs = dict( + model=self.model_id, + work_dir=self.tmp_dir, + model_revision=self.REVISION, + workers=2, + max_epochs=2, + train_iters_per_epoch=2, + val_iters_per_epoch=1, + custom_conf=self.custom_conf) + + trainer = build_trainer( + Trainers.speech_dfsmn_kws_char_farfield, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files, + f'work_dir:{self.tmp_dir}') From 144ffee2cfaa89389930cba4c991ce03493502d2 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Thu, 13 Oct 2022 10:16:07 +0800 Subject: [PATCH 26/57] [to #42322933] Add explict model id in tts UT Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371244 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371244 --- 
tests/pipelines/test_text_to_speech.py | 89 +++++++++++--------------- 1 file changed, 36 insertions(+), 53 deletions(-) diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index 0caf1c84..9a1cd7b1 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -27,67 +27,50 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, def setUp(self) -> None: self.task = Tasks.text_to_speech - zhcn_text = '今天北京天气怎么样' - en_text = 'How is the weather in Beijing?' - zhcn_voice = ['zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', 'zhibei_emo'] - enus_voice = ['andy', 'annie'] - engb_voice = ['luca', 'luna'] - self.tts_test_cases = [] - for voice in zhcn_voice: - model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, - 'zh-cn') - self.tts_test_cases.append({ - 'voice': voice, - 'model_id': model_id, - 'text': zhcn_text - }) - for voice in enus_voice: - model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, - 'en-us') - self.tts_test_cases.append({ - 'voice': voice, - 'model_id': model_id, - 'text': en_text - }) - for voice in engb_voice: - model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, - 'en-gb') - self.tts_test_cases.append({ - 'voice': voice, - 'model_id': model_id, - 'text': en_text - }) - zhcn_model_id = 'damo/speech_sambert-hifigan_tts_zh-cn_16k' - enus_model_id = 'damo/speech_sambert-hifigan_tts_en-us_16k' - engb_model_id = 'damo/speech_sambert-hifigan_tts_en-gb_16k' - self.tts_test_cases.append({ - 'voice': 'zhcn', - 'model_id': zhcn_model_id, - 'text': zhcn_text - }) - self.tts_test_cases.append({ - 'voice': 'enus', - 'model_id': enus_model_id, - 'text': en_text - }) - self.tts_test_cases.append({ - 'voice': 'engb', - 'model_id': engb_model_id, - 'text': en_text - }) + self.zhcn_text = '今天北京天气怎么样' + self.en_text = 'How is the weather in Beijing?' 
+ self.zhcn_voices = [ + 'zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', 'zhibei_emo', 'zhcn' + ] + self.zhcn_models = [ + 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zhizhe_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zhiyan_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zhibei_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zh-cn_16k' + ] + self.en_voices = ['luca', 'luna', 'andy', 'annie', 'engb', 'enus'] + self.en_models = [ + 'damo/speech_sambert-hifigan_tts_luca_en-gb_16k', + 'damo/speech_sambert-hifigan_tts_luna_en-gb_16k', + 'damo/speech_sambert-hifigan_tts_andy_en-us_16k', + 'damo/speech_sambert-hifigan_tts_annie_en-us_16k', + 'damo/speech_sambert-hifigan_tts_en-gb_16k', + 'damo/speech_sambert-hifigan_tts_en-us_16k' + ] @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_pipeline(self): - for case in self.tts_test_cases: - logger.info('test %s' % case['voice']) + for i in range(len(self.zhcn_voices)): + logger.info('test %s' % self.zhcn_voices[i]) model = Model.from_pretrained( - model_name_or_path=case['model_id'], revision='pytorch_am') + model_name_or_path=self.zhcn_models[i], revision='pytorch_am') sambert_hifigan_tts = pipeline(task=self.task, model=model) self.assertTrue(sambert_hifigan_tts is not None) - output = sambert_hifigan_tts(input=case['text']) + output = sambert_hifigan_tts(input=self.zhcn_text) self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) pcm = output[OutputKeys.OUTPUT_PCM] - write('output_%s.wav' % case['voice'], 16000, pcm) + write('output_%s.wav' % self.zhcn_voices[i], 16000, pcm) + for i in range(len(self.en_voices)): + logger.info('test %s' % self.en_voices[i]) + model = Model.from_pretrained( + model_name_or_path=self.en_models[i], revision='pytorch_am') + sambert_hifigan_tts = pipeline(task=self.task, model=model) + self.assertTrue(sambert_hifigan_tts is not None) + output = sambert_hifigan_tts(input=self.en_text) + self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) + pcm = output[OutputKeys.OUTPUT_PCM] + write('output_%s.wav' % self.en_voices[i], 16000, pcm) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From f63d7f18f14dc919297ec104bef56cf2e1990bfc Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Thu, 13 Oct 2022 10:39:56 +0800 Subject: [PATCH 27/57] [to #42322933]remove sleep in train_loop Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9910419 --- modelscope/trainers/trainer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 4c21d63f..9eaff762 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -828,7 +828,6 @@ class EpochBasedTrainer(BaseTrainer): self.model.train() for _ in range(self._epoch, self._max_epochs): self.invoke_hook(TrainerStages.before_train_epoch) - time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(data_loader): if i < self.inner_iter: # inner_iter may be read out from the checkpoint file, so skip the trained iters in the epoch. 
@@ -852,7 +851,6 @@ class EpochBasedTrainer(BaseTrainer): self._inner_iter = 0 self._epoch += 1 - time.sleep(1) # wait for some hooks like loggers to finish self.invoke_hook(TrainerStages.after_run) def evaluation_loop(self, data_loader, metric_classes): From 0eb823b76490bb3249bf1420143873293a132fb7 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 13 Oct 2022 10:52:40 +0800 Subject: [PATCH 28/57] [to #42322933] support t5_with_translation Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10383770 * T5 support translate --- modelscope/metainfo.py | 4 ++ .../nlp/text2text_generation_pipeline.py | 39 ++++++++++++++++--- modelscope/preprocessors/nlp/nlp_base.py | 3 +- modelscope/utils/config.py | 10 +++++ tests/pipelines/test_text2text_generation.py | 26 +++++++------ 5 files changed, 63 insertions(+), 19 deletions(-) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 46c3b138..59c779e9 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -228,6 +228,9 @@ class Pipelines(object): relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + translation_en_to_de = 'translation_en_to_de' # keep it underscore + translation_en_to_ro = 'translation_en_to_ro' # keep it underscore + translation_en_to_fr = 'translation_en_to_fr' # keep it underscore # audio tasks sambert_hifigan_tts = 'sambert-hifigan-tts' @@ -314,6 +317,7 @@ class Preprocessors(object): bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' text_gen_tokenizer = 'text-gen-tokenizer' text2text_gen_preprocessor = 'text2text-gen-preprocessor' + text2text_translate_preprocessor = 'text2text-translate-preprocessor' token_cls_tokenizer = 'token-cls-tokenizer' ner_tokenizer = 'ner-tokenizer' nli_tokenizer = 'nli-tokenizer' diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py index 21aacf54..a739df69 100644 --- a/modelscope/pipelines/nlp/text2text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text2text_generation_pipeline.py @@ -1,21 +1,35 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 
 from modelscope.metainfo import Pipelines
 from modelscope.models.base import Model
 from modelscope.outputs import OutputKeys
-from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.base import Input, Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Text2TextGenerationPreprocessor
+from modelscope.utils.config import use_task_specific_params
 from modelscope.utils.constant import Tasks
 
 __all__ = ['Text2TextGenerationPipeline']
 
+TRANSLATE_PIPELINES = [
+    Pipelines.translation_en_to_de,
+    Pipelines.translation_en_to_ro,
+    Pipelines.translation_en_to_fr,
+]
+
 
 @PIPELINES.register_module(
     Tasks.text2text_generation, module_name=Pipelines.text2text_generation)
+@PIPELINES.register_module(
+    Tasks.text2text_generation, module_name=Pipelines.translation_en_to_de)
+@PIPELINES.register_module(
+    Tasks.text2text_generation, module_name=Pipelines.translation_en_to_ro)
+@PIPELINES.register_module(
+    Tasks.text2text_generation, module_name=Pipelines.translation_en_to_fr)
 class Text2TextGenerationPipeline(Pipeline):
 
     def __init__(
@@ -39,13 +53,13 @@ class Text2TextGenerationPipeline(Pipeline):
 
         Example:
             >>> from modelscope.pipelines import pipeline
-            >>> pipeline_ins = pipeline(task='text-generation',
-            >>>     model='damo/nlp_palm2.0_text-generation_chinese-base')
-            >>> sentence1 = '本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方:'
-            >>>     '1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代'
+            >>> pipeline_ins = pipeline(task='text2text-generation',
+            >>>     model='damo/nlp_t5_text2text-generation_chinese-base')
+            >>> sentence1 = '中国的首都位于。'
            >>> print(pipeline_ins(sentence1))
            >>> # Or use the dict input:
            >>> print(pipeline_ins({'sentence': sentence1}))
+            >>> # 北京
 
        To view other examples please check the tests/pipelines/test_text_generation.py.
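+
+        A similar, illustrative call for the newly registered translation
+        pipelines (the model id below is the test model referenced in
+        tests/pipelines/test_text2text_generation.py, not necessarily a
+        production model):
+            >>> translator = pipeline(task='text2text-generation',
+            >>>     model='damo/t5-translate-base-test')
+            >>> print(translator('My name is Wolfgang and I live in Berlin'))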
""" @@ -56,9 +70,22 @@ class Text2TextGenerationPipeline(Pipeline): model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) self.tokenizer = preprocessor.tokenizer + self.pipeline = model.pipeline.type model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + """ Provide specific preprocess for text2text generation pipeline in order to handl multi tasks + """ + if not isinstance(inputs, str): + raise ValueError(f'Not supported input type: {type(inputs)}') + + if self.pipeline in TRANSLATE_PIPELINES: + use_task_specific_params(self.model, self.pipeline) + inputs = self.model.config.prefix + inputs + + return super().preprocess(inputs, **preprocess_params) + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index a9be0cb0..bec7e4e1 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -12,7 +12,8 @@ from modelscope.metainfo import Models, Preprocessors from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.config import (Config, ConfigFields, + use_task_specific_params) from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.logger import get_logger diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index 0b966bef..c4fa3c1b 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -633,6 +633,16 @@ def check_config(cfg: Union[str, ConfigDict]): check_attr(ConfigFields.evaluation) +def use_task_specific_params(model, task): + """Update config with summarization specific params.""" + task_specific_params = model.config.task_specific_params + + if task_specific_params is not None: + pars = task_specific_params.get(task, {}) + logger.info(f'using task specific params for {task}: {pars}') + model.config.update(pars) + + class JSONIteratorEncoder(json.JSONEncoder): """Implement this method in order that supporting arbitrary iterators, it returns a serializable object for ``obj``, or calls the base implementation diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index 2506547e..d90263c4 100644 --- a/tests/pipelines/test_text2text_generation.py +++ b/tests/pipelines/test_text2text_generation.py @@ -15,42 +15,44 @@ from modelscope.utils.test_utils import test_level class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.model_id = 'damo/t5-cn-base-test' - self.input = '中国的首都位于。' + self.model_id_generate = 'damo/t5-cn-base-test' + self.input_generate = '中国的首都位于。' + self.model_id_translate = 'damo/t5-translate-base-test' + self.input_translate = 'My name is Wolfgang and I live in Berlin' - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_T5(self): - cache_path = snapshot_download(self.model_id) - model = T5ForConditionalGeneration(cache_path) + cache_path = snapshot_download(self.model_id_generate) + model = T5ForConditionalGeneration.from_pretrained(cache_path) preprocessor 
= Text2TextGenerationPreprocessor(cache_path) pipeline1 = Text2TextGenerationPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.text2text_generation, model=model, preprocessor=preprocessor) print( - f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}' + f'pipeline1: {pipeline1(self.input_generate)}\npipeline2: {pipeline2(self.input_generate)}' ) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_pipeline_with_model_instance(self): - model = Model.from_pretrained(self.model_id) + model = Model.from_pretrained(self.model_id_translate) preprocessor = Text2TextGenerationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.text2text_generation, model=model, preprocessor=preprocessor) - print(pipeline_ins(self.input)) + print(pipeline_ins(self.input_translate)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_pipeline_with_model_id(self): pipeline_ins = pipeline( - task=Tasks.text2text_generation, model=self.model_id) - print(pipeline_ins(self.input)) + task=Tasks.text2text_generation, model=self.model_id_translate) + print(pipeline_ins(self.input_translate)) @unittest.skip( 'only for test cases, there is no default official model yet') def test_run_pipeline_without_model_id(self): pipeline_ins = pipeline(task=Tasks.text2text_generation) - print(pipeline_ins(self.input)) + print(pipeline_ins(self.input_generate)) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From 2d50c812df09c3423fe8ccbc12b15eaae5706c79 Mon Sep 17 00:00:00 2001 From: "hanyuan.chy" Date: Thu, 13 Oct 2022 13:48:11 +0800 Subject: [PATCH 29/57] [to #42322933] support finetune on cv/hand_2d_keypoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加2d手部关键点检测finetune功能 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371710 --- modelscope/metainfo.py | 2 + .../models/cv/hand_2d_keypoints/__init__.py | 20 ++++++ .../cv/hand_2d_keypoints/hand_2d_keypoints.py | 16 +++++ .../cv/hand_2d_keypoints/__init__.py | 22 ++++++ .../hand_2d_keypoints_dataset.py | 38 ++++++++++ .../test_easycv_trainer_hand_2d_keypoints.py | 72 +++++++++++++++++++ 6 files changed, 170 insertions(+) create mode 100644 modelscope/models/cv/hand_2d_keypoints/__init__.py create mode 100644 modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py create mode 100644 modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py create mode 100644 modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py create mode 100644 tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 59c779e9..2e3fed98 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -50,6 +50,7 @@ class Models(object): # EasyCV models yolox = 'YOLOX' segformer = 'Segformer' + hand_2d_keypoints = 'HRNet-Hand2D-Keypoints' image_object_detection_auto = 'image-object-detection-auto' # nlp models @@ -439,6 +440,7 @@ class Datasets(object): """ ClsDataset = 'ClsDataset' Face2dKeypointsDataset = 'Face2dKeypointsDataset' + HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset' HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset' SegDataset = 'SegDataset' DetDataset = 'DetDataset' diff --git a/modelscope/models/cv/hand_2d_keypoints/__init__.py b/modelscope/models/cv/hand_2d_keypoints/__init__.py 
new file mode 100644 index 00000000..2b06f19a --- /dev/null +++ b/modelscope/models/cv/hand_2d_keypoints/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hand_2d_keypoints import Hand2dKeyPoints + +else: + _import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py b/modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py new file mode 100644 index 00000000..15a97c30 --- /dev/null +++ b/modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.models.pose import TopDown + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints) +class Hand2dKeyPoints(EasyCVBaseModel, TopDown): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + TopDown.__init__(self, *args, **kwargs) diff --git a/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py b/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py new file mode 100644 index 00000000..5c1c72c1 --- /dev/null +++ b/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hand_2d_keypoints_dataset import Hand2DKeypointDataset + +else: + _import_structure = { + 'hand_2d_keypoints_dataset': ['Hand2DKeypointDataset'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py b/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py new file mode 100644 index 00000000..89ee0bb8 --- /dev/null +++ b/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.datasets.pose import \ + HandCocoWholeBodyDataset as _HandCocoWholeBodyDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.hand_2d_keypoints, + module_name=Datasets.HandCocoWholeBodyDataset) +class HandCocoWholeBodyDataset(EasyCVBaseDataset, _HandCocoWholeBodyDataset): + """EasyCV dataset for human hand 2d keypoints. + + Args: + split_config (dict): Dataset root path from MSDataset, e.g. + {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. 
+ """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _HandCocoWholeBodyDataset.__init__(self, *args, **kwargs) diff --git a/tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py b/tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py new file mode 100644 index 00000000..270ecbc4 --- /dev/null +++ b/tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py @@ -0,0 +1,72 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import DownloadMode, LogKeys, Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + + +@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') +class EasyCVTrainerTestHand2dKeypoints(unittest.TestCase): + model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody' + + def setUp(self): + self.logger = get_logger() + self.logger.info(('Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + def _train(self): + cfg_options = {'train.max_epochs': 20} + + trainer_name = Trainers.easycv + + train_dataset = MsDataset.load( + dataset_name='cv_hand_2d_keypoints_coco_wholebody', + namespace='chenhyer', + split='subtrain', + download_mode=DownloadMode.FORCE_REDOWNLOAD) + eval_dataset = MsDataset.load( + dataset_name='cv_hand_2d_keypoints_coco_wholebody', + namespace='chenhyer', + split='subtrain', + download_mode=DownloadMode.FORCE_REDOWNLOAD) + + kwargs = dict( + model=self.model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_options=cfg_options) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_single_gpu(self): + self._train() + + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + self.assertIn(f'{LogKeys.EPOCH}_10.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_20.pth', results_files) + + +if __name__ == '__main__': + unittest.main() From 14e52b308aa6e67564b230ae49b0615615d752ec Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 13 Oct 2022 14:41:26 +0800 Subject: [PATCH 30/57] fix token classification bugs Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10385225 * fix token classification bugs --- modelscope/models/nlp/bert/__init__.py | 12 ++------ modelscope/models/nlp/bert/modeling_bert.py | 25 ++++++++-------- modelscope/models/nlp/token_classification.py | 29 +++++++++++++++++-- .../nlp/token_classification_pipeline.py | 7 ++++- 4 files changed, 47 insertions(+), 26 deletions(-) diff --git a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py index 705d9519..cca79c2f 100644 --- a/modelscope/models/nlp/bert/__init__.py +++ b/modelscope/models/nlp/bert/__init__.py @@ -5,7 +5,6 @@ from 
modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .modeling_bert import ( - BERT_PRETRAINED_MODEL_ARCHIVE_LIST, BertForMaskedLM, BertForMultipleChoice, BertForNextSentencePrediction, @@ -20,21 +19,14 @@ if TYPE_CHECKING: load_tf_weights_in_bert, ) - from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig - from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer - from .tokenization_bert_fast import BertTokenizerFast + from .configuration_bert import BertConfig, BertOnnxConfig else: _import_structure = { - 'configuration_bert': - ['BERT_PRETRAINED_CONFIG_ARCHIVE_MAP', 'BertConfig', 'BertOnnxConfig'], - 'tokenization_bert': - ['BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer'], + 'configuration_bert': ['BertConfig', 'BertOnnxConfig'], } - _import_structure['tokenization_bert_fast'] = ['BertTokenizerFast'] _import_structure['modeling_bert'] = [ - 'BERT_PRETRAINED_MODEL_ARCHIVE_LIST', 'BertForMaskedLM', 'BertForMultipleChoice', 'BertForNextSentencePrediction', diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py index f8fd5994..e91a6433 100755 --- a/modelscope/models/nlp/bert/modeling_bert.py +++ b/modelscope/models/nlp/bert/modeling_bert.py @@ -1872,19 +1872,18 @@ class BertForTokenClassification(BertPreTrainedModel): @add_start_docstrings_to_model_forward( BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index c63e8037..e58967a5 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -176,7 +176,7 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): @MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) @MODELS.register_module(Tasks.token_classification, module_name=Models.bert) -class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): +class BertForTokenClassification(TokenClassification, BertPreTrainedModel): """Bert token classification model. Inherited from TokenClassificationBase. @@ -187,7 +187,7 @@ class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): def __init__(self, config, model_dir): if hasattr(config, 'base_model_prefix'): - BertForSequenceClassification.base_model_prefix = config.base_model_prefix + BertForTokenClassification.base_model_prefix = config.base_model_prefix super().__init__(config, model_dir) def build_base_model(self): @@ -218,3 +218,28 @@ class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict, **kwargs) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. 
+ num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(BertPreTrainedModel, + BertForTokenClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 5367c1a8..c57dbf20 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -40,7 +40,12 @@ class TokenClassificationPipeline(Pipeline): sequence_length=kwargs.pop('sequence_length', 128)) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.id2label = getattr(model, 'id2label') + if hasattr(model, 'id2label'): + self.id2label = getattr(model, 'id2label') + else: + model_config = getattr(model, 'config') + self.id2label = getattr(model_config, 'id2label') + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ 'as a parameter or make sure the preprocessor has the attribute.' From 383452b0a4be12d3a5d15417042d7ccf3e285301 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 13 Oct 2022 17:16:17 +0800 Subject: [PATCH 31/57] [to #45452180] python 3.10.x compatible Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10394282 * python 3.10.x compatible --- modelscope/utils/tensor_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index b68a639c..406d671f 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Part of the implementation is borrowed from huggingface/transformers. -from collections import Mapping +from collections.abc import Mapping def torch_nested_numpify(tensors): From 5bdb8fb78b5cb0d01431891d8e55cb5510a4ece4 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Thu, 13 Oct 2022 18:30:06 +0800 Subject: [PATCH 32/57] [to #45451935]fix: add create model detail log for create failed. 
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10382795 --- modelscope/hub/api.py | 24 +++++++++++------------- modelscope/hub/errors.py | 17 +++++++++++++++-- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 8dcfa5b0..214045dd 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -24,8 +24,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode) from modelscope.utils.logger import get_logger from .errors import (InvalidParameter, NotExistError, RequestError, - datahub_raise_on_error, handle_http_response, is_ok, - raise_on_error) + datahub_raise_on_error, handle_http_post_error, + handle_http_response, is_ok, raise_on_error) from .utils.utils import (get_dataset_hub_endpoint, get_endpoint, model_id_to_group_owner_name) @@ -105,17 +105,15 @@ class HubApi: path = f'{self.endpoint}/api/v1/models' owner_or_group, name = model_id_to_group_owner_name(model_id) - r = requests.post( - path, - json={ - 'Path': owner_or_group, - 'Name': name, - 'ChineseName': chinese_name, - 'Visibility': visibility, # server check - 'License': license - }, - cookies=cookies) - r.raise_for_status() + body = { + 'Path': owner_or_group, + 'Name': name, + 'ChineseName': chinese_name, + 'Visibility': visibility, # server check + 'License': license + } + r = requests.post(path, json=body, cookies=cookies) + handle_http_post_error(r, path, body) raise_on_error(r.json()) model_repo_url = f'{get_endpoint()}/{model_id}' return model_repo_url diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index c095a6ec..fb483287 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -4,6 +4,10 @@ from http import HTTPStatus from requests.exceptions import HTTPError +from modelscope.utils.logger import get_logger + +logger = get_logger() + class NotExistError(Exception): pass @@ -45,15 +49,24 @@ def is_ok(rsp): return rsp['Code'] == HTTPStatus.OK and rsp['Success'] +def handle_http_post_error(response, url, request_body): + try: + response.raise_for_status() + except HTTPError as error: + logger.error('Request %s with body: %s exception, respoonse body: %s' % + (url, request_body, response.body)) + raise error + + def handle_http_response(response, logger, cookies, model_id): try: response.raise_for_status() - except HTTPError: + except HTTPError as error: if cookies is None: # code in [403] and logger.error( f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \ private. Please login first.') - raise + raise error def raise_on_error(rsp): From 6818ffdc8e598b5a8aeb525c05549b9bce5b3784 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Thu, 13 Oct 2022 19:42:19 +0800 Subject: [PATCH 33/57] [to #42322933] feat: optimize ANS metric value Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10399100 --- modelscope/metrics/audio_noise_metric.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modelscope/metrics/audio_noise_metric.py b/modelscope/metrics/audio_noise_metric.py index f26db46d..8555e95b 100644 --- a/modelscope/metrics/audio_noise_metric.py +++ b/modelscope/metrics/audio_noise_metric.py @@ -35,6 +35,8 @@ class AudioNoiseMetric(Metric): total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr return { 'total_loss': total_loss.item(), - 'avg_sisnr': avg_sisnr.item(), + # model use opposite number of sisnr as a calculation shortcut. 
+ # revert it in evaluation result + 'avg_sisnr': -avg_sisnr.item(), MetricKeys.AVERAGE_LOSS: avg_loss.item() } From c5c14ad60a8ba573263078892ada19f47698fc1c Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Thu, 13 Oct 2022 22:25:57 +0800 Subject: [PATCH 34/57] [to #42322933]fix psnr/ssim metrics for NAFNet (image denoise) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10403246 --- modelscope/metrics/image_denoise_metric.py | 203 +++++++++++++----- .../image_denoise/nafnet_for_image_denoise.py | 10 +- .../msdatasets/task_datasets/__init__.py | 1 + 3 files changed, 154 insertions(+), 60 deletions(-) diff --git a/modelscope/metrics/image_denoise_metric.py b/modelscope/metrics/image_denoise_metric.py index c6df8df1..1692f299 100644 --- a/modelscope/metrics/image_denoise_metric.py +++ b/modelscope/metrics/image_denoise_metric.py @@ -1,14 +1,16 @@ -# The code is modified based on BasicSR metrics: -# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py +# ------------------------------------------------------------------------ +# Copyright (c) Alibaba, Inc. and its affiliates. +# ------------------------------------------------------------------------ +# modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/metrics/psnr_ssim.py +# ------------------------------------------------------------------------ from typing import Dict import cv2 import numpy as np +import torch from modelscope.metainfo import Metrics from modelscope.utils.registry import default_group -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) from .base import Metric from .builder import METRICS, MetricKeys @@ -22,16 +24,15 @@ class ImageDenoiseMetric(Metric): label_name = 'target' def __init__(self): + super(ImageDenoiseMetric, self).__init__() self.preds = [] self.labels = [] def add(self, outputs: Dict, inputs: Dict): ground_truths = outputs[ImageDenoiseMetric.label_name] eval_results = outputs[ImageDenoiseMetric.pred_name] - self.preds.append( - torch_nested_numpify(torch_nested_detach(eval_results))) - self.labels.append( - torch_nested_numpify(torch_nested_detach(ground_truths))) + self.preds.append(eval_results) + self.labels.append(ground_truths) def evaluate(self): psnr_list, ssim_list = [], [] @@ -69,80 +70,117 @@ def reorder_image(img, input_order='HWC'): return img -def calculate_psnr(img, img2, crop_border, input_order='HWC', **kwargs): +def calculate_psnr(img1, img2, crop_border, input_order='HWC'): """Calculate PSNR (Peak Signal-to-Noise Ratio). - Reference: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio Args: - img (ndarray): Images with range [0, 255]. - img2 (ndarray): Images with range [0, 255]. - crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. - input_order (str): Whether the input order is 'HWC' or 'CHW'. Default: 'HWC'. + img1 (ndarray/tensor): Images with range [0, 255]/[0, 1]. + img2 (ndarray/tensor): Images with range [0, 255]/[0, 1]. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the PSNR calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. Returns: - float: PSNR result. + float: psnr result. 
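+
+    Example (an illustrative sanity check; identical inputs yield an
+    infinite PSNR):
+        >>> img = np.random.rand(64, 64, 3)
+        >>> calculate_psnr(img, img, crop_border=0)
+        inf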
""" - assert img.shape == img2.shape, ( - f'Image shapes are different: {img.shape}, {img2.shape}.') + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') if input_order not in ['HWC', 'CHW']: raise ValueError( - f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' - ) - img = reorder_image(img, input_order=input_order) + f'Wrong input_order {input_order}. Supported input_orders are ' + '"HWC" and "CHW"') + if type(img1) == torch.Tensor: + if len(img1.shape) == 4: + img1 = img1.squeeze(0) + img1 = img1.detach().cpu().numpy().transpose(1, 2, 0) + if type(img2) == torch.Tensor: + if len(img2.shape) == 4: + img2 = img2.squeeze(0) + img2 = img2.detach().cpu().numpy().transpose(1, 2, 0) + + img1 = reorder_image(img1, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) - - if crop_border != 0: - img = img[crop_border:-crop_border, crop_border:-crop_border, ...] - img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - - img = img.astype(np.float64) + img1 = img1.astype(np.float64) img2 = img2.astype(np.float64) - mse = np.mean((img - img2)**2) - if mse == 0: - return float('inf') - return 10. * np.log10(255. * 255. / mse) + if crop_border != 0: + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + def _psnr(img1, img2): + + mse = np.mean((img1 - img2)**2) + if mse == 0: + return float('inf') + max_value = 1. if img1.max() <= 1 else 255. + return 20. * np.log10(max_value / np.sqrt(mse)) + + return _psnr(img1, img2) -def calculate_ssim(img, img2, crop_border, input_order='HWC', **kwargs): +def calculate_ssim(img1, img2, crop_border, input_order='HWC', ssim3d=True): """Calculate SSIM (structural similarity). - ``Paper: Image quality assessment: From error visibility to structural similarity`` + Ref: + Image quality assessment: From error visibility to structural similarity The results are the same as that of the official released MATLAB code in https://ece.uwaterloo.ca/~z70wang/research/ssim/. For three-channel images, SSIM is calculated for each channel and then averaged. Args: - img (ndarray): Images with range [0, 255]. + img1 (ndarray): Images with range [0, 255]. img2 (ndarray): Images with range [0, 255]. - crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the SSIM calculation. input_order (str): Whether the input order is 'HWC' or 'CHW'. Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. Returns: - float: SSIM result. + float: ssim result. """ - assert img.shape == img2.shape, ( - f'Image shapes are different: {img.shape}, {img2.shape}.') + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') if input_order not in ['HWC', 'CHW']: raise ValueError( - f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' - ) - img = reorder_image(img, input_order=input_order) + f'Wrong input_order {input_order}. 
Supported input_orders are ' + '"HWC" and "CHW"') + + if type(img1) == torch.Tensor: + if len(img1.shape) == 4: + img1 = img1.squeeze(0) + img1 = img1.detach().cpu().numpy().transpose(1, 2, 0) + if type(img2) == torch.Tensor: + if len(img2.shape) == 4: + img2 = img2.squeeze(0) + img2 = img2.detach().cpu().numpy().transpose(1, 2, 0) + + img1 = reorder_image(img1, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) - if crop_border != 0: - img = img[crop_border:-crop_border, crop_border:-crop_border, ...] - img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - - img = img.astype(np.float64) + img1 = img1.astype(np.float64) img2 = img2.astype(np.float64) - ssims = [] - for i in range(img.shape[2]): - ssims.append(_ssim(img[..., i], img2[..., i])) - return np.array(ssims).mean() + if crop_border != 0: + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + def _cal_ssim(img1, img2): + ssims = [] + + max_value = 1 if img1.max() <= 1 else 255 + with torch.no_grad(): + final_ssim = _ssim_3d(img1, img2, max_value) if ssim3d else _ssim( + img1, img2, max_value) + ssims.append(final_ssim) + + return np.array(ssims).mean() + + return _cal_ssim(img1, img2) -def _ssim(img, img2): +def _ssim(img, img2, max_value): """Calculate SSIM (structural similarity) for one channel images. It is called by func:`calculate_ssim`. Args: @@ -152,8 +190,11 @@ def _ssim(img, img2): float: SSIM result. """ - c1 = (0.01 * 255)**2 - c2 = (0.03 * 255)**2 + c1 = (0.01 * max_value)**2 + c2 = (0.03 * max_value)**2 + + img = img.astype(np.float64) + img2 = img2.astype(np.float64) kernel = cv2.getGaussianKernel(11, 1.5) window = np.outer(kernel, kernel.transpose()) @@ -171,3 +212,61 @@ def _ssim(img, img2): tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) ssim_map = tmp1 / tmp2 return ssim_map.mean() + + +def _3d_gaussian_calculator(img, conv3d): + out = conv3d(img.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0) + return out + + +def _generate_3d_gaussian_kernel(): + kernel = cv2.getGaussianKernel(11, 1.5) + window = np.outer(kernel, kernel.transpose()) + kernel_3 = cv2.getGaussianKernel(11, 1.5) + kernel = torch.tensor(np.stack([window * k for k in kernel_3], axis=0)) + conv3d = torch.nn.Conv3d( + 1, + 1, (11, 11, 11), + stride=1, + padding=(5, 5, 5), + bias=False, + padding_mode='replicate') + conv3d.weight.requires_grad = False + conv3d.weight[0, 0, :, :, :] = kernel + return conv3d + + +def _ssim_3d(img1, img2, max_value): + assert len(img1.shape) == 3 and len(img2.shape) == 3 + """Calculate SSIM (structural similarity) for one channel images. + It is called by func:`calculate_ssim`. + Args: + img1 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'. + img2 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'. + Returns: + float: ssim result. 
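+
+    Example (an illustrative sanity check on the numpy/cv2 path; the
+    default ssim3d=True path additionally requires a CUDA device):
+        >>> img = np.random.rand(64, 64, 3) * 255
+        >>> calculate_ssim(img, img, crop_border=0, ssim3d=False)  # ~1.0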
+ """ + C1 = (0.01 * max_value)**2 + C2 = (0.03 * max_value)**2 + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) + + kernel = _generate_3d_gaussian_kernel().cuda() + + img1 = torch.tensor(img1).float().cuda() + img2 = torch.tensor(img2).float().cuda() + + mu1 = _3d_gaussian_calculator(img1, kernel) + mu2 = _3d_gaussian_calculator(img2, kernel) + + mu1_sq = mu1**2 + mu2_sq = mu2**2 + mu1_mu2 = mu1 * mu2 + sigma1_sq = _3d_gaussian_calculator(img1**2, kernel) - mu1_sq + sigma2_sq = _3d_gaussian_calculator(img2**2, kernel) - mu2_sq + sigma12 = _3d_gaussian_calculator(img1 * img2, kernel) - mu1_mu2 + + tmp1 = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2) + tmp2 = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2) + ssim_map = tmp1 / tmp2 + return float(ssim_map.mean()) diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index a6fbf22f..4e8fc0ed 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -3,7 +3,6 @@ import os from copy import deepcopy from typing import Any, Dict, Union -import numpy as np import torch.cuda from torch.nn.parallel import DataParallel, DistributedDataParallel @@ -78,13 +77,8 @@ class NAFNetForImageDenoise(TorchModel): def _evaluate_postprocess(self, input: Tensor, target: Tensor) -> Dict[str, list]: preds = self.model(input) - preds = list(torch.split(preds, 1, 0)) - targets = list(torch.split(target, 1, 0)) - - preds = [(pred.data * 255.).squeeze(0).permute( - 1, 2, 0).cpu().numpy().astype(np.uint8) for pred in preds] - targets = [(target.data * 255.).squeeze(0).permute( - 1, 2, 0).cpu().numpy().astype(np.uint8) for target in targets] + preds = list(torch.split(preds.clamp(0, 1), 1, 0)) + targets = list(torch.split(target.clamp(0, 1), 1, 0)) return {'pred': preds, 'target': targets} diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index 35c060f0..7c31969a 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -26,6 +26,7 @@ else: 'video_summarization_dataset': ['VideoSummarizationDataset'], 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], 'image_inpainting': ['ImageInpaintingDataset'], + 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], } import sys From 275f8b432328cfe9df38a105e616233c63efb6a1 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Fri, 14 Oct 2022 13:55:09 +0800 Subject: [PATCH 35/57] Revert "[to #45071449] fix setup error " This reverts commit a26e6e38697a8795b99de4c7929b415baef78268. 
--- modelscope/models/audio/tts/models/datasets/__init__.py | 0 requirements/framework.txt | 1 - 2 files changed, 1 deletion(-) mode change 100755 => 100644 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100755 new mode 100644 diff --git a/requirements/framework.txt b/requirements/framework.txt index aae200da..b51faeda 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -15,7 +15,6 @@ pyyaml requests scipy setuptools -setuptools_scm tensorboard tqdm>=4.64.0 yapf From 155856301f0e4f61be0d4753734f1496e7cbf7ce Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Fri, 14 Oct 2022 14:00:57 +0800 Subject: [PATCH 36/57] [to #42322933] do not check training config in pipeline() Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10407849 --- modelscope/utils/config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index c4fa3c1b..e46da7df 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -609,11 +609,12 @@ class Config: return parse_fn(args) -def check_config(cfg: Union[str, ConfigDict]): +def check_config(cfg: Union[str, ConfigDict], is_training=False): """ Check whether configuration file is valid, If anything wrong, exception will be raised. Args: cfg (str or ConfigDict): Config file path or config object. + is_training: indicate if checking training related elements """ if isinstance(cfg, str): @@ -627,8 +628,9 @@ def check_config(cfg: Union[str, ConfigDict]): check_attr(ConfigFields.task) check_attr(ConfigFields.pipeline) - if hasattr(cfg, ConfigFields.train): + if is_training: check_attr(ConfigFields.model) + check_attr(ConfigFields.train) check_attr(ConfigFields.preprocessor) check_attr(ConfigFields.evaluation) From 355da866c553216a2b45b5f1ae68a27eebcf62ec Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Fri, 14 Oct 2022 18:07:29 +0800 Subject: [PATCH 37/57] [to #42322933] limit tranformers version temporarily --- requirements/nlp.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements/nlp.txt b/requirements/nlp.txt index f18dde2e..2e0838fc 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -3,7 +3,7 @@ fasttext jieba>=0.42.1 megatron_util pai-easynlp -# “protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.” +# protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. protobuf>=3.19.0,<3.21.0 # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated @@ -14,4 +14,5 @@ spacy>=2.3.5 subword_nmt>=0.3.8 text2sql_lgesql tokenizers -transformers>=4.12.0 +# recent 4.23.1 update introduce breaking api change, limit upper version temporarily. 
+transformers>=4.12.0,<=4.22.0 From 876058556deabcdf1a399e79983444d97ec790f2 Mon Sep 17 00:00:00 2001 From: hemu Date: Fri, 14 Oct 2022 18:15:52 +0800 Subject: [PATCH 38/57] fix generate --- modelscope/models/nlp/gpt3/modeling_gpt3.py | 3 +++ requirements/nlp.txt | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index 498d15de..ade36e36 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -346,3 +346,6 @@ class GPT3Model(PreTrainedModel): } model.load_state_dict(state_dict) return model + + def prepare_inputs_for_generation(self, input_ids, *args, **kwargs): + return {'input_ids': input_ids} diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 2e0838fc..123c238e 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -14,5 +14,4 @@ spacy>=2.3.5 subword_nmt>=0.3.8 text2sql_lgesql tokenizers -# recent 4.23.1 update introduce breaking api change, limit upper version temporarily. -transformers>=4.12.0,<=4.22.0 +transformers From 1b4d5ccb9c8b7a7d93c91aa85e43b017826df2c0 Mon Sep 17 00:00:00 2001 From: "xingjun.wxj" Date: Fri, 14 Oct 2022 18:32:38 +0800 Subject: [PATCH 39/57] [to #42322933]MsDataset upload and load supports directory. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 上传和下载支持多文件操作 --- modelscope/hub/api.py | 34 ++++--- modelscope/hub/utils/utils.py | 8 +- modelscope/msdatasets/ms_dataset.py | 52 +++++++---- modelscope/msdatasets/utils/dataset_utils.py | 90 ++++++++++++++++++- modelscope/msdatasets/utils/download_utils.py | 18 ++-- modelscope/msdatasets/utils/oss_utils.py | 9 +- modelscope/msdatasets/utils/upload_utils.py | 40 ++++++++- modelscope/utils/constant.py | 7 ++ tests/msdatasets/test_dataset_upload.py | 43 ++++++++- 9 files changed, 250 insertions(+), 51 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 214045dd..dc4d0ab2 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -26,18 +26,15 @@ from modelscope.utils.logger import get_logger from .errors import (InvalidParameter, NotExistError, RequestError, datahub_raise_on_error, handle_http_post_error, handle_http_response, is_ok, raise_on_error) -from .utils.utils import (get_dataset_hub_endpoint, get_endpoint, - model_id_to_group_owner_name) +from .utils.utils import get_endpoint, model_id_to_group_owner_name logger = get_logger() class HubApi: - def __init__(self, endpoint=None, dataset_endpoint=None): + def __init__(self, endpoint=None): self.endpoint = endpoint if endpoint is not None else get_endpoint() - self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint( - ) def login( self, @@ -288,7 +285,7 @@ class HubApi: return files def list_datasets(self): - path = f'{self.dataset_endpoint}/api/v1/datasets' + path = f'{self.endpoint}/api/v1/datasets' headers = None params = {} r = requests.get(path, params=params, headers=headers) @@ -315,13 +312,13 @@ class HubApi: cache_dir): shutil.rmtree(cache_dir) os.makedirs(cache_dir, exist_ok=True) - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}' + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}' r = requests.get(datahub_url) resp = r.json() datahub_raise_on_error(datahub_url, resp) dataset_id = resp['Data']['Id'] dataset_type = resp['Data']['Type'] - datahub_url = 
f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' r = requests.get(datahub_url) resp = r.json() datahub_raise_on_error(datahub_url, resp) @@ -339,7 +336,7 @@ class HubApi: file_path = file_info['Path'] extension = os.path.splitext(file_path)[-1] if extension in dataset_meta_format: - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ f'Revision={revision}&FilePath={file_path}' r = requests.get(datahub_url) r.raise_for_status() @@ -363,7 +360,7 @@ class HubApi: namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): if file_name.endswith('.csv'): - file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ + file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ f'Revision={revision}&FilePath={file_name}' return file_name @@ -372,7 +369,7 @@ class HubApi: dataset_name: str, namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ f'ststoken?Revision={revision}' return self.datahub_remote_call(datahub_url) @@ -383,7 +380,7 @@ class HubApi: namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ f'ststoken?Revision={revision}' cookies = requests.utils.dict_from_cookiejar(cookies) @@ -392,6 +389,19 @@ class HubApi: raise_on_error(resp) return resp['Data'] + def list_oss_dataset_objects(self, dataset_name, namespace, max_limit, + is_recursive, is_filter_dir, revision, + cookies): + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' 
\ + f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}' + cookies = requests.utils.dict_from_cookiejar(cookies) + + resp = requests.get(url=url, cookies=cookies) + resp = resp.json() + raise_on_error(resp) + resp = resp['Data'] + return resp + def on_dataset_download(self, dataset_name: str, namespace: str) -> None: url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' r = requests.post(url) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index d84b78ea..7d3c2499 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -4,8 +4,7 @@ import hashlib import os from typing import Optional -from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT, - DEFAULT_MODELSCOPE_DOMAIN, +from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_URL_SCHEME) @@ -44,11 +43,6 @@ def get_endpoint(): return MODELSCOPE_URL_SCHEME + modelscope_domain -def get_dataset_hub_endpoint(): - return os.environ.get('HUB_DATASET_ENDPOINT', - DEFAULT_MODELSCOPE_DATA_ENDPOINT) - - def compute_hash(file_path): BUFFER_SIZE = 1024 * 64 # 64k buffer size sha256_hash = hashlib.sha256() diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 361b8ae0..cf055d6d 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -1,6 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import math import os from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional, Sequence, Union) @@ -17,19 +16,18 @@ from datasets.utils.file_utils import (is_relative_path, relative_to_absolute_path) from modelscope.hub.repository import DatasetRepository +from modelscope.msdatasets.task_datasets.builder import build_task_dataset +from modelscope.msdatasets.utils.dataset_builder import ExternalDataset +from modelscope.msdatasets.utils.dataset_utils import ( + get_dataset_files, get_target_dataset_structure, load_dataset_builder) +from modelscope.msdatasets.utils.download_utils import DatasetDownloadManager +from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager from modelscope.utils.config import ConfigDict from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION, DatasetFormations, DownloadMode, Hubs) from modelscope.utils.logger import get_logger -from .task_datasets.builder import build_task_dataset -from .utils.dataset_builder import ExternalDataset -from .utils.dataset_utils import (get_dataset_files, - get_target_dataset_structure, - load_dataset_builder) -from .utils.download_utils import DatasetDownloadManager -from .utils.upload_utils import DatasetUploadManager logger = get_logger() @@ -234,7 +232,6 @@ class MsDataset: # dataset organized to be compatible with hf format if dataset_formation == DatasetFormations.hf_compatible: dataset_name = dataset_scripts['.py'][0] - download_dataset = dataset_name else: raise FileNotFoundError( f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " @@ -270,7 +267,8 @@ class MsDataset: raise TypeError('path must be a str or a list, but got' f' {type(dataset_name)}') - if download_dataset: + is_ci_test = os.getenv('CI_TEST') == 'True' + if download_dataset and not is_ci_test: try: api.on_dataset_download( dataset_name=download_dataset, namespace=namespace) @@ -570,15 +568,26 @@ class 
MsDataset: local_file_path: str, dataset_name: str, namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE, - version: Optional[str] = DEFAULT_DATASET_REVISION) -> None: - """Upload dataset file to the ModelScope Hub. Please login to the ModelScope Hub first. + version: Optional[str] = DEFAULT_DATASET_REVISION, + num_processes: Optional[int] = None, + chunksize: Optional[int] = 1, + filter_hidden_files: Optional[bool] = True) -> None: + """Upload dataset file or directory to the ModelScope Hub. Please login to the ModelScope Hub first. Args: - object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip - local_file_path (str): Local file to upload + object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip or your-dataset-name + local_file_path (str): Local file or directory to upload dataset_name (str): Name of the dataset namespace(str, optional): Namespace of the dataset version: Optional[str]: Version of the dataset + num_processes: Optional[int]: The number of processes used for multi-process uploading. + This is only applicable when local_file_path is a directory, and we are uploading mutliple-files + insided the directory. When None provided, the number returned by os.cpu_count() is used as default. + chunksize: Optional[int]: The chunksize of objects to upload. + For very long iterables using a large value for chunksize can make the job complete much faster than + using the default value of 1. Available if local_file_path is a directory. + filter_hidden_files: Optional[bool]: Whether to filter hidden files. + Available if local_file_path is a directory. Returns: None @@ -586,7 +595,20 @@ class MsDataset: """ _upload_manager = DatasetUploadManager( dataset_name=dataset_name, namespace=namespace, version=version) - _upload_manager.upload(object_name, local_file_path) + + if os.path.isfile(local_file_path): + _upload_manager.upload( + object_name=object_name, local_file_path=local_file_path) + elif os.path.isdir(local_file_path): + _upload_manager.upload_dir( + object_dir_name=object_name, + local_dir_path=local_file_path, + num_processes=num_processes, + chunksize=chunksize, + filter_hidden_files=filter_hidden_files) + else: + raise ValueError( + f'{local_file_path} is not a valid file path or directory') @staticmethod def clone_meta(dataset_work_dir: str, diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index ef42f75f..db9d1fee 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -6,7 +6,8 @@ from typing import Any, Mapping, Optional, Sequence, Union from datasets.builder import DatasetBuilder -from modelscope.utils.constant import DEFAULT_DATASET_REVISION +from modelscope.hub.api import HubApi +from modelscope.utils.constant import DEFAULT_DATASET_REVISION, DownloadParams from modelscope.utils.logger import get_logger from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder @@ -77,6 +78,81 @@ def get_target_dataset_structure(dataset_structure: dict, return target_subset_name, target_dataset_structure +def list_dataset_objects(hub_api: HubApi, max_limit: int, is_recursive: bool, + dataset_name: str, namespace: str, + version: str) -> list: + """ + List all of objects for specific dataset. + + Args: + hub_api (class HubApi): HubApi instance. + max_limit (int): Max number of objects. + is_recursive (bool): Whether to list objects recursively. + dataset_name (str): Dataset name. 
+ namespace (str): Namespace. + version (str): Dataset version. + Returns: + res (list): List of objects, i.e., ['train/images/001.png', 'train/images/002.png', 'val/images/001.png', ...] + """ + res = [] + cookies = hub_api.check_cookies_upload_data(use_cookies=True) + objects = hub_api.list_oss_dataset_objects( + dataset_name=dataset_name, + namespace=namespace, + max_limit=max_limit, + is_recursive=is_recursive, + is_filter_dir=True, + revision=version, + cookies=cookies) + + for item in objects: + object_key = item.get('Key') + res.append(object_key) + + return res + + +def contains_dir(file_map) -> bool: + """ + To check whether input contains at least one directory. + + Args: + file_map (dict): Structure of data files. e.g., {'train': 'train.zip', 'validation': 'val.zip'} + Returns: + True if input contains at least one directory, False otherwise. + """ + res = False + for k, v in file_map.items(): + if isinstance(v, str) and not v.endswith('.zip'): + res = True + break + return res + + +def get_split_objects_map(file_map, objects): + """ + Get the map between dataset split and oss objects. + + Args: + file_map (dict): Structure of data files. e.g., {'train': 'train', 'validation': 'val'}, both of train and val + are dirs. + objects (list): List of oss objects. e.g., ['train/001/1_123.png', 'train/001/1_124.png', 'val/003/3_38.png'] + Returns: + A map of split-objects. e.g., {'train': ['train/001/1_123.png', 'train/001/1_124.png'], + 'validation':['val/003/3_38.png']} + """ + res = {} + for k, v in file_map.items(): + res[k] = [] + + for obj_key in objects: + for k, v in file_map.items(): + if obj_key.startswith(v): + res[k].append(obj_key) + + return res + + def get_dataset_files(subset_split_into: dict, dataset_name: str, namespace: str, @@ -95,14 +171,24 @@ def get_dataset_files(subset_split_into: dict, meta_map = defaultdict(dict) file_map = defaultdict(dict) args_map = defaultdict(dict) - from modelscope.hub.api import HubApi modelscope_api = HubApi() + objects = list_dataset_objects( + hub_api=modelscope_api, + max_limit=DownloadParams.MAX_LIST_OBJECTS_NUM.value, + is_recursive=True, + dataset_name=dataset_name, + namespace=namespace, + version=revision) + for split, info in subset_split_into.items(): meta_map[split] = modelscope_api.get_dataset_file_url( info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): file_map[split] = info['file'] args_map[split] = info.get('args') + + if contains_dir(file_map): + file_map = get_split_objects_map(file_map, objects) return meta_map, file_map, args_map diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py index 2e21bf50..b1c7a5ab 100644 --- a/modelscope/msdatasets/utils/download_utils.py +++ b/modelscope/msdatasets/utils/download_utils.py @@ -10,16 +10,14 @@ from .oss_utils import OssUtilities class DatasetDownloadManager(DownloadManager): - def __init__( - self, - dataset_name: str, - namespace: str, - version: str, - data_dir: Optional[str] = None, - download_config: Optional[DownloadConfig] = None, - base_path: Optional[str] = None, - record_checksums=True, - ): + def __init__(self, + dataset_name: str, + namespace: str, + version: str, + data_dir: Optional[str] = None, + download_config: Optional[DownloadConfig] = None, + base_path: Optional[str] = None, + record_checksums=True): super().__init__(dataset_name, data_dir, download_config, base_path, record_checksums) self._namespace = namespace diff --git a/modelscope/msdatasets/utils/oss_utils.py 
b/modelscope/msdatasets/utils/oss_utils.py index 4a403876..d7d61e89 100644 --- a/modelscope/msdatasets/utils/oss_utils.py +++ b/modelscope/msdatasets/utils/oss_utils.py @@ -50,11 +50,16 @@ class OssUtilities: progress_callback=self._percentage) return local_path - def upload(self, oss_object_name: str, local_file_path: str) -> str: + def upload(self, oss_object_name: str, local_file_path: str, + indicate_individual_progress: bool) -> str: retry_count = 0 object_key = os.path.join(self.oss_dir, oss_object_name) resumable_store = oss2.ResumableStore( root=self.upload_resumable_tmp_store) + if indicate_individual_progress: + progress_callback = self._percentage + else: + progress_callback = None while True: try: @@ -66,7 +71,7 @@ class OssUtilities: store=resumable_store, multipart_threshold=self.upload_multipart_threshold, part_size=self.upload_part_size, - progress_callback=self._percentage, + progress_callback=progress_callback, num_threads=self.upload_num_threads) break except Exception: diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py index 4813b89f..2b4422b2 100644 --- a/modelscope/msdatasets/utils/upload_utils.py +++ b/modelscope/msdatasets/utils/upload_utils.py @@ -1,5 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os +from multiprocessing.dummy import Pool as ThreadPool + +from tqdm import tqdm + from .oss_utils import OssUtilities @@ -19,5 +24,38 @@ class DatasetUploadManager(object): def upload(self, object_name: str, local_file_path: str) -> str: object_key = self.oss_utilities.upload( - oss_object_name=object_name, local_file_path=local_file_path) + oss_object_name=object_name, + local_file_path=local_file_path, + indicate_individual_progress=True) return object_key + + def upload_dir(self, object_dir_name: str, local_dir_path: str, + num_processes: int, chunksize: int, + filter_hidden_files: bool) -> int: + + def run_upload(args): + self.oss_utilities.upload( + oss_object_name=args[0], + local_file_path=args[1], + indicate_individual_progress=False) + + files_list = [] + for root, dirs, files in os.walk(local_dir_path): + for file_name in files: + if filter_hidden_files and file_name.startswith('.'): + continue + # Concatenate directory name and relative path into a oss object key. e.g., train/001/1_1230.png + object_name = os.path.join( + object_dir_name, + root.replace(local_dir_path, '', 1).strip('/'), file_name) + + local_file_path = os.path.join(root, file_name) + files_list.append((object_name, local_file_path)) + + with ThreadPool(processes=num_processes) as pool: + result = list( + tqdm( + pool.imap(run_upload, files_list, chunksize=chunksize), + total=len(files_list))) + + return len(result) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 5f0532ce..9e10e802 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -227,6 +227,13 @@ class DownloadMode(enum.Enum): FORCE_REDOWNLOAD = 'force_redownload' +class DownloadParams(enum.Enum): + """ + Parameters for downloading dataset. 
+ """ + MAX_LIST_OBJECTS_NUM = 50000 + + class DatasetFormations(enum.Enum): """ How a dataset is organized and interpreted """ diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index 1179414d..3d35d480 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -6,9 +6,13 @@ import unittest import zipfile from modelscope.msdatasets import MsDataset -from modelscope.utils.constant import ModelFile +from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects +from modelscope.utils import logger as logging +from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile from modelscope.utils.test_utils import test_level +logger = logging.get_logger(__name__) + KEY_EXTRACTED = 'extracted' @@ -39,7 +43,8 @@ class DatasetUploadTest(unittest.TestCase): def tearDown(self): os.chdir(self.old_dir) shutil.rmtree(self.temp_dir, ignore_errors=True) - print('The test dir successfully removed!') + logger.info( + f'Temporary directory {self.temp_dir} successfully removed!') @staticmethod def get_raw_downloaded_file_path(extracted_path): @@ -68,6 +73,40 @@ class DatasetUploadTest(unittest.TestCase): dataset_name=self.dataset_name, namespace=self.namespace) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ds_upload_dir(self): + ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train') + config_train = ms_ds_train._hf_ds.config_kwargs + extracted_path_train = config_train.get('split_config').get('train') + + MsDataset.upload( + object_name='train', + local_file_path=os.path.join(extracted_path_train, + 'Pets/images/train'), + dataset_name=self.dataset_name, + namespace=self.namespace) + MsDataset.upload( + object_name='val', + local_file_path=os.path.join(extracted_path_train, + 'Pets/images/val'), + dataset_name=self.dataset_name, + namespace=self.namespace) + + objects = list_dataset_objects( + hub_api=self.api, + max_limit=-1, + is_recursive=True, + dataset_name=self.dataset_name, + namespace=self.namespace, + version=DEFAULT_DATASET_REVISION) + + logger.info(f'{len(objects)} objects have been uploaded: {objects}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ds_download_dir(self): + test_ds = MsDataset.load(self.dataset_name, self.namespace) + assert test_ds.config_kwargs['split_config'].values() + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_clone_meta(self): MsDataset.clone_meta( From deb847614a518537a22567209519dbea89feabcd Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Fri, 14 Oct 2022 21:59:52 +0800 Subject: [PATCH 40/57] [to #42322933] limit espnet version --- requirements/audio.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/audio.txt b/requirements/audio.txt index 742cf166..bef32121 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -1,5 +1,5 @@ easyasr>=0.0.2 -espnet>=202204 +espnet==202204 h5py inflect keras From 202fcdf2984a214e8d4a55b11607eefafa77af0f Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Fri, 14 Oct 2022 23:11:19 +0800 Subject: [PATCH 41/57] [to #42322933] change tableqa output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修改output的结构,直接返回可转化成json format的结构 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10415403 --- .../models/nlp/table_question_answering.py | 6 +++--- modelscope/outputs.py | 7 ++++++- 
.../nlp/table_question_answering_pipeline.py | 3 ++- .../pipelines/test_table_question_answering.py | 18 +++++++++++------- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py index c6a03ef3..c2134df2 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/table_question_answering.py @@ -691,11 +691,11 @@ class TableQuestionAnswering(Model): sels.append(l_hs[ib] - 1) aggs.append(sql['agg'][ia]) continue - sels.append(sel) + sels.append(int(sel)) if sql['agg'][ia] == -1: aggs.append(0) else: - aggs.append(sql['agg'][ia]) + aggs.append(int(sql['agg'][ia])) if len(sels) == 0: sels.append(l_hs[ib] - 1) aggs.append(0) @@ -712,7 +712,7 @@ class TableQuestionAnswering(Model): for i in range(wl): if wc_os[i] == -1: continue - conds.append([wc_os[i], wo_os[i], pr_wvi_str[ib][i]]) + conds.append([int(wc_os[i]), int(wo_os[i]), pr_wvi_str[ib][i]]) if len(conds) == 0: conds.append([l_hs[ib] - 1, 2, 'Nulll']) sql['conds'] = conds diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 3001c03c..c08779b4 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -36,6 +36,8 @@ class OutputKeys(object): UUID = 'uuid' WORD = 'word' KWS_LIST = 'kws_list' + SQL_STRING = 'sql_string' + SQL_QUERY = 'sql_query' HISTORY = 'history' QUERT_RESULT = 'query_result' TIMESTAMPS = 'timestamps' @@ -583,7 +585,10 @@ TASK_OUTPUTS = { # "sql": "SELECT shop.Name FROM shop." # "sql_history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]} # } - Tasks.table_question_answering: [OutputKeys.OUTPUT, OutputKeys.HISTORY], + Tasks.table_question_answering: [ + OutputKeys.SQL_STRING, OutputKeys.SQL_QUERY, OutputKeys.HISTORY, + OutputKeys.QUERT_RESULT + ], # ============ audio tasks =================== # asr result for single sample diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index e1b2b07b..ca17c9b1 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -311,7 +311,8 @@ class TableQuestionAnsweringPipeline(Pipeline): tabledata = {'headers': [], 'cells': []} output = { - OutputKeys.OUTPUT: sql, + OutputKeys.SQL_STRING: sql.string, + OutputKeys.SQL_QUERY: sql.query, OutputKeys.HISTORY: result['sql'], OutputKeys.QUERT_RESULT: json.dumps(tabledata, ensure_ascii=False), } diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 68e0564f..3d943e51 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -3,10 +3,12 @@ import os import unittest from typing import List +import json from transformers import BertTokenizer from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline from modelscope.preprocessors import TableQuestionAnsweringPreprocessor @@ -38,11 +40,12 @@ def tableqa_tracking_and_print_results_with_history( 'history_sql': historical_queries }) print('question', question) - print('sql text:', output_dict['output'].string) - print('sql query:', output_dict['output'].query) - print('query result:', output_dict['query_result']) + print('sql text:', 
output_dict[OutputKeys.SQL_STRING]) + print('sql query:', output_dict[OutputKeys.SQL_QUERY]) + print('query result:', output_dict[OutputKeys.QUERT_RESULT]) + print('json dumps', json.dumps(output_dict)) print() - historical_queries = output_dict['history'] + historical_queries = output_dict[OutputKeys.HISTORY] def tableqa_tracking_and_print_results_without_history( @@ -60,9 +63,10 @@ def tableqa_tracking_and_print_results_without_history( for question in test_case['utterance']: output_dict = p({'question': question}) print('question', question) - print('sql text:', output_dict['output'].string) - print('sql query:', output_dict['output'].query) - print('query result:', output_dict['query_result']) + print('sql text:', output_dict[OutputKeys.SQL_STRING]) + print('sql query:', output_dict[OutputKeys.SQL_QUERY]) + print('query result:', output_dict[OutputKeys.QUERT_RESULT]) + print('json dumps', json.dumps(output_dict)) print() From 7e7303a658fae50a027420547035bb7319c64c76 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Sat, 15 Oct 2022 08:51:06 +0800 Subject: [PATCH 42/57] [to #42322933] remove fasttext from nlp requirements --- modelscope/utils/error.py | 9 +++++++++ modelscope/utils/import_utils.py | 1 + requirements/nlp.txt | 3 +-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py index a6bbc8b3..a894063c 100644 --- a/modelscope/utils/error.py +++ b/modelscope/utils/error.py @@ -111,3 +111,12 @@ You can install it with pip on linux: On windows, please checkout the instructions on the installation page: https://github.com/facebookresearch/fairseq and follow the ones that match your environment. """ + +# docstyle-ignore +FASTTEXT_IMPORT_ERROR = """ +{0} requires the fasttext library but it was not found in your environment. +You can install it with pip on linux or mac: +`pip install fasttext` +Or you can checkout the instructions on the +installation page: https://github.com/facebookresearch/fastText and follow the ones that match your environment. 
+""" diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py index 2a6fdc80..5db5ea98 100644 --- a/modelscope/utils/import_utils.py +++ b/modelscope/utils/import_utils.py @@ -292,6 +292,7 @@ REQUIREMENTS_MAAPING = OrderedDict([ ('decord', (is_package_available('decord'), DECORD_IMPORT_ERROR)), ('deepspeed', (is_package_available('deepspeed'), DEEPSPEED_IMPORT_ERROR)), ('fairseq', (is_package_available('fairseq'), FAIRSEQ_IMPORT_ERROR)), + ('fasttext', (is_package_available('fasttext'), FASTTEXT_IMPORT_ERROR)), ]) SYSTEM_PACKAGE = set(['os', 'sys', 'typing']) diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 123c238e..a5f3cbd9 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,5 +1,4 @@ en_core_web_sm>=2.3.5 -fasttext jieba>=0.42.1 megatron_util pai-easynlp @@ -14,4 +13,4 @@ spacy>=2.3.5 subword_nmt>=0.3.8 text2sql_lgesql tokenizers -transformers +transformers>=4.12.0 From 4682783619f86726d0bb3c880c2100e91d126355 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sat, 15 Oct 2022 20:33:55 +0800 Subject: [PATCH 43/57] [to #44902165] bump version to 0.5.0 --- modelscope/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/version.py b/modelscope/version.py index 1e4826d6..2b8877c5 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.4.7' +__version__ = '0.5.0' From f6e542cdcb6c1a1be690750bebda791ed5c90589 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Mon, 17 Oct 2022 10:40:08 +0800 Subject: [PATCH 44/57] refine pipeline input to support demo service * image_captioninig support single image and dict input * image_style_transfer use dict input Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10417330 --- modelscope/pipeline_inputs.py | 16 ++++++++++------ modelscope/pipelines/base.py | 6 +++++- tests/pipelines/test_image_style_transfer.py | 15 +++++++++------ 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 2b14c278..34b731c6 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -97,8 +97,10 @@ TASK_INPUTS = { InputType.IMAGE, Tasks.image_to_image_translation: InputType.IMAGE, - Tasks.image_style_transfer: - InputType.IMAGE, + Tasks.image_style_transfer: { + 'content': InputType.IMAGE, + 'style': InputType.IMAGE, + }, Tasks.image_portrait_stylization: InputType.IMAGE, Tasks.live_category: @@ -147,8 +149,9 @@ TASK_INPUTS = { InputType.TEXT, Tasks.translation: InputType.TEXT, - Tasks.word_segmentation: - InputType.TEXT, + Tasks.word_segmentation: [InputType.TEXT, { + 'text': InputType.TEXT, + }], Tasks.part_of_speech: InputType.TEXT, Tasks.named_entity_recognition: @@ -194,8 +197,9 @@ TASK_INPUTS = { InputType.AUDIO, # ============ multi-modal tasks =================== - Tasks.image_captioning: - InputType.IMAGE, + Tasks.image_captioning: [InputType.IMAGE, { + 'image': InputType.IMAGE, + }], Tasks.visual_grounding: { 'image': InputType.IMAGE, 'text': InputType.TEXT diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 5732a9d7..ea329be4 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -236,7 +236,11 @@ class Pipeline(ABC): if isinstance(input_type, list): matched_type = None for t in input_type: - if type(t) == type(input): + if isinstance(input, (dict, tuple)): + if type(t) == type(input): + matched_type = t + break + elif isinstance(t, str): matched_type = t break if matched_type is 
None: diff --git a/tests/pipelines/test_image_style_transfer.py b/tests/pipelines/test_image_style_transfer.py index a02d5308..5f37f204 100644 --- a/tests/pipelines/test_image_style_transfer.py +++ b/tests/pipelines/test_image_style_transfer.py @@ -25,8 +25,9 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): Tasks.image_style_transfer, model=snapshot_path) result = image_style_transfer( - 'data/test/images/style_transfer_content.jpg', - style='data/test/images/style_transfer_style.jpg') + dict( + content='data/test/images/style_transfer_content.jpg', + style='data/test/images/style_transfer_style.jpg')) cv2.imwrite('result_styletransfer1.png', result[OutputKeys.OUTPUT_IMG]) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -35,8 +36,9 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): Tasks.image_style_transfer, model=self.model_id) result = image_style_transfer( - 'data/test/images/style_transfer_content.jpg', - style='data/test/images/style_transfer_style.jpg') + dict( + content='data/test/images/style_transfer_content.jpg', + style='data/test/images/style_transfer_style.jpg')) cv2.imwrite('result_styletransfer2.png', result[OutputKeys.OUTPUT_IMG]) print('style_transfer.test_run_modelhub done') @@ -45,8 +47,9 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): image_style_transfer = pipeline(Tasks.image_style_transfer) result = image_style_transfer( - 'data/test/images/style_transfer_content.jpg', - style='data/test/images/style_transfer_style.jpg') + dict( + content='data/test/images/style_transfer_content.jpg', + style='data/test/images/style_transfer_style.jpg')) cv2.imwrite('result_styletransfer3.png', result[OutputKeys.OUTPUT_IMG]) print('style_transfer.test_run_modelhub_default_model done') From 88a7599efb0168cd1914e19d0990ab1b37fc7406 Mon Sep 17 00:00:00 2001 From: "wenqi.oywq" Date: Mon, 17 Oct 2022 14:05:12 +0800 Subject: [PATCH 45/57] [to #42322933]change output channels from RGB to BGR, to consistent with demo-service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 默认输出为array的,通道格式统一为BGR格式,本次修改是为了与这个格式一致 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10422508 --- modelscope/pipelines/cv/image_color_enhance_pipeline.py | 2 +- tests/pipelines/test_image_color_enhance.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modelscope/pipelines/cv/image_color_enhance_pipeline.py b/modelscope/pipelines/cv/image_color_enhance_pipeline.py index d21d879c..3a4cf8bc 100644 --- a/modelscope/pipelines/cv/image_color_enhance_pipeline.py +++ b/modelscope/pipelines/cv/image_color_enhance_pipeline.py @@ -55,5 +55,5 @@ class ImageColorEnhancePipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: output_img = (inputs['outputs'].squeeze(0) * 255.).type( - torch.uint8).cpu().permute(1, 2, 0).numpy() + torch.uint8).cpu().permute(1, 2, 0).numpy()[:, :, ::-1] return {OutputKeys.OUTPUT_IMG: output_img} diff --git a/tests/pipelines/test_image_color_enhance.py b/tests/pipelines/test_image_color_enhance.py index 9b72999e..7c3ae8c0 100644 --- a/tests/pipelines/test_image_color_enhance.py +++ b/tests/pipelines/test_image_color_enhance.py @@ -21,8 +21,7 @@ class ImageColorEnhanceTest(unittest.TestCase, DemoCompatibilityCheck): def pipeline_inference(self, pipeline: Pipeline, input_location: str): result = pipeline(input_location) if result is not None: - cv2.imwrite('result.png', 
result[OutputKeys.OUTPUT_IMG][:, :, - [2, 1, 0]]) + cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG]) print(f'Output written to {osp.abspath("result.png")}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') From 674e1a7878f63603aa3bbc669fbac6a8b8a5b8a5 Mon Sep 17 00:00:00 2001 From: "wendi.hwd" Date: Mon, 17 Oct 2022 14:06:07 +0800 Subject: [PATCH 46/57] [to #42322933]cv/cvdet_fix_outputs->master fix outputs Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10421413 * fix outputs --- .../pipelines/cv/image_detection_pipeline.py | 8 ++++++-- tests/pipelines/test_object_detection.py | 20 ++++--------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/modelscope/pipelines/cv/image_detection_pipeline.py b/modelscope/pipelines/cv/image_detection_pipeline.py index f5554ca2..08633c35 100644 --- a/modelscope/pipelines/cv/image_detection_pipeline.py +++ b/modelscope/pipelines/cv/image_detection_pipeline.py @@ -43,11 +43,15 @@ class ImageDetectionPipeline(Pipeline): bboxes, scores, labels = self.model.postprocess(inputs['data']) if bboxes is None: - return None + outputs = { + OutputKeys.SCORES: [], + OutputKeys.LABELS: [], + OutputKeys.BOXES: [] + } + return outputs outputs = { OutputKeys.SCORES: scores, OutputKeys.LABELS: labels, OutputKeys.BOXES: bboxes } - return outputs diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index 2cb217d9..00a71371 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -19,20 +19,14 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): model_id = 'damo/cv_vit_object-detection_coco' object_detect = pipeline(Tasks.image_object_detection, model=model_id) result = object_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_object_detection_with_default_task(self): input_location = 'data/test/images/image_detection.jpg' object_detect = pipeline(Tasks.image_object_detection) result = object_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_human_detection(self): @@ -40,20 +34,14 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): model_id = 'damo/cv_resnet18_human-detection' human_detect = pipeline(Tasks.human_detection, model=model_id) result = human_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_human_detection_with_default_task(self): input_location = 'data/test/images/image_detection.jpg' human_detect = pipeline(Tasks.human_detection) result = human_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From 542c4ce1b3433ca1d51ac0c0349b3d8f87c51f41 Mon Sep 17 00:00:00 2001 From: "shichen.fsc" Date: Mon, 17 Oct 2022 14:07:05 +0800 Subject: [PATCH 47/57] [to #42322933] Fix bug in KWS when setting customized keyword Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10412829 --- .../audio/asr/generic_automatic_speech_recognition.py | 2 ++ 
modelscope/models/audio/kws/generic_key_word_spotting.py | 2 ++ modelscope/pipelines/audio/kws_kwsbp_pipeline.py | 9 +++++++++ modelscope/preprocessors/asr.py | 2 ++ modelscope/preprocessors/kws.py | 2 ++ tests/pipelines/test_key_word_spotting.py | 2 +- 6 files changed, 18 insertions(+), 1 deletion(-) diff --git a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py index 11accf0a..aebc6751 100644 --- a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py +++ b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict diff --git a/modelscope/models/audio/kws/generic_key_word_spotting.py b/modelscope/models/audio/kws/generic_key_word_spotting.py index c1b7a0e4..2f70327d 100644 --- a/modelscope/models/audio/kws/generic_key_word_spotting.py +++ b/modelscope/models/audio/kws/generic_key_word_spotting.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict diff --git a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py index 450a12bb..5555c9e6 100644 --- a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py +++ b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py @@ -37,6 +37,12 @@ class KeyWordSpottingKwsbpPipeline(Pipeline): **kwargs) -> Dict[str, Any]: if 'keywords' in kwargs.keys(): self.keywords = kwargs['keywords'] + if isinstance(self.keywords, str): + word_list = [] + word = {} + word['keyword'] = self.keywords + word_list.append(word) + self.keywords = word_list else: self.keywords = None @@ -96,6 +102,9 @@ class KeyWordSpottingKwsbpPipeline(Pipeline): pos_list=pos_kws_list, neg_list=neg_kws_list) + if 'kws_list' not in rst_dict: + rst_dict['kws_list'] = [] + return rst_dict def run_with_kwsbp(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/asr.py b/modelscope/preprocessors/asr.py index d58383d7..facaa132 100644 --- a/modelscope/preprocessors/asr.py +++ b/modelscope/preprocessors/asr.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict, List, Union diff --git a/modelscope/preprocessors/kws.py b/modelscope/preprocessors/kws.py index 9c370ed5..6f09d545 100644 --- a/modelscope/preprocessors/kws.py +++ b/modelscope/preprocessors/kws.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import os from typing import Any, Dict, List, Union diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py index 91f9f566..f31d212b 100644 --- a/tests/pipelines/test_key_word_spotting.py +++ b/tests/pipelines/test_key_word_spotting.py @@ -245,7 +245,7 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_wav_by_customized_keywords(self): - keywords = [{'keyword': '播放音乐'}] + keywords = '播放音乐' kws_result = self.run_pipeline( model_id=self.model_id, From 8fa385e27cc9a949c8544e838a6250ab527b0685 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Mon, 17 Oct 2022 15:42:24 +0800 Subject: [PATCH 48/57] [to #42322933] Add upload in hub api Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10386689 --- modelscope/hub/git.py | 10 +++ modelscope/hub/upload.py | 117 +++++++++++++++++++++++++ tests/hub/test_hub_upload.py | 164 +++++++++++++++++++++++++++++++++++ 3 files changed, 291 insertions(+) create mode 100644 modelscope/hub/upload.py create mode 100644 tests/hub/test_hub_upload.py diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index a149ede1..db76506e 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import re import subprocess from typing import List from xmlrpc.client import Boolean @@ -177,6 +178,15 @@ class GitCommandWrapper(metaclass=Singleton): cmds = ['-C', '%s' % repo_dir, 'checkout', '-b', revision] return self._run_git_command(*cmds) + def get_remote_branches(self, repo_dir: str): + cmds = ['-C', '%s' % repo_dir, 'branch', '-r'] + rsp = self._run_git_command(*cmds) + info = [ + line.strip() + for line in rsp.stdout.decode('utf8').strip().split(os.linesep) + ][1:] + return ['/'.join(line.split('/')[1:]) for line in info] + def pull(self, repo_dir: str): cmds = ['-C', repo_dir, 'pull'] return self._run_git_command(*cmds) diff --git a/modelscope/hub/upload.py b/modelscope/hub/upload.py new file mode 100644 index 00000000..9dffc60e --- /dev/null +++ b/modelscope/hub/upload.py @@ -0,0 +1,117 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import datetime +import os +import shutil +import tempfile +import uuid +from typing import Dict, Optional +from uuid import uuid4 + +from filelock import FileLock + +from modelscope import __version__ +from modelscope.hub.api import HubApi, ModelScopeConfig +from modelscope.hub.errors import InvalidParameter, NotLoginException +from modelscope.hub.git import GitCommandWrapper +from modelscope.hub.repository import Repository +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def upload_folder(model_id: str, + model_dir: str, + visibility: int = 0, + license: str = None, + chinese_name: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = DEFAULT_MODEL_REVISION): + """ + Upload model from a given directory to given repository. A valid model directory + must contain a configuration.json file. + + This function upload the files in given directory to given repository. If the + given repository is not exists in remote, it will automatically create it with + given visibility, license and chinese_name parameters. If the revision is also + not exists in remote repository, it will create a new branch for it. 
+ + This function must be called before calling HubApi's login with a valid token + which can be obtained from ModelScope's website. + + Args: + model_id (`str`): + The model id to be uploaded, caller must have write permission for it. + model_dir(`str`): + The Absolute Path of the finetune result. + visibility(`int`, defaults to `0`): + Visibility of the new created model(1-private, 5-public). If the model is + not exists in ModelScope, this function will create a new model with this + visibility and this parameter is required. You can ignore this parameter + if you make sure the model's existence. + license(`str`, defaults to `None`): + License of the new created model(see License). If the model is not exists + in ModelScope, this function will create a new model with this license + and this parameter is required. You can ignore this parameter if you + make sure the model's existence. + chinese_name(`str`, *optional*, defaults to `None`): + chinese name of the new created model. + commit_message(`str`, *optional*, defaults to `None`): + commit message of the push request. + revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION): + which branch to push. If the branch is not exists, It will create a new + branch and push to it. + """ + if model_id is None: + raise InvalidParameter('model_id cannot be empty!') + if model_dir is None: + raise InvalidParameter('model_dir cannot be empty!') + if not os.path.exists(model_dir) or os.path.isfile(model_dir): + raise InvalidParameter('model_dir must be a valid directory.') + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + if not os.path.exists(cfg_file): + raise ValueError(f'{model_dir} must contain a configuration.json.') + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise NotLoginException('Must login before upload!') + files_to_save = os.listdir(model_dir) + api = HubApi() + try: + api.get_model(model_id=model_id) + except Exception: + if visibility is None or license is None: + raise InvalidParameter( + 'visibility and license cannot be empty if want to create new repo' + ) + logger.info('Create new model %s' % model_id) + api.create_model( + model_id=model_id, + visibility=visibility, + license=license, + chinese_name=chinese_name) + tmp_dir = tempfile.mkdtemp() + git_wrapper = GitCommandWrapper() + try: + repo = Repository(model_dir=tmp_dir, clone_from=model_id) + branches = git_wrapper.get_remote_branches(tmp_dir) + if revision not in branches: + logger.info('Create new branch %s' % revision) + git_wrapper.new_branch(tmp_dir, revision) + git_wrapper.checkout(tmp_dir, revision) + for f in files_to_save: + if f[0] != '.': + src = os.path.join(model_dir, f) + if os.path.isdir(src): + shutil.copytree(src, os.path.join(tmp_dir, f)) + else: + shutil.copy(src, tmp_dir) + if not commit_message: + date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + commit_message = '[automsg] push model %s to hub at %s' % ( + model_id, date) + repo.push(commit_message=commit_message, branch=revision) + except Exception: + raise + finally: + shutil.rmtree(tmp_dir, ignore_errors=True) diff --git a/tests/hub/test_hub_upload.py b/tests/hub/test_hub_upload.py new file mode 100644 index 00000000..d7e6e439 --- /dev/null +++ b/tests/hub/test_hub_upload.py @@ -0,0 +1,164 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest + +from modelscope.hub.api import HubApi +from modelscope.hub.constants import Licenses, ModelVisibility +from modelscope.hub.repository import Repository +from modelscope.hub.upload import upload_folder +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level +from .test_utils import TEST_ACCESS_TOKEN1, delete_credential + +logger = get_logger() + + +class HubUploadTest(unittest.TestCase): + + def setUp(self): + logger.info('SetUp') + self.api = HubApi() + self.user = os.environ.get('TEST_MODEL_ORG', 'citest') + logger.info(self.user) + self.create_model_name = '%s/%s' % (self.user, 'test_model_upload') + temporary_dir = tempfile.mkdtemp() + self.work_dir = temporary_dir + self.model_dir = os.path.join(temporary_dir, self.create_model_name) + self.finetune_path = os.path.join(self.work_dir, 'finetune_path') + self.repo_path = os.path.join(self.work_dir, 'repo_path') + os.mkdir(self.finetune_path) + os.system("echo '{}'>%s" + % os.path.join(self.finetune_path, ModelFile.CONFIGURATION)) + + def tearDown(self): + logger.info('TearDown') + shutil.rmtree(self.model_dir, ignore_errors=True) + self.api.delete_model(model_id=self.create_model_name) + + def test_upload_exits_repo_master(self): + logger.info('basic test for upload!') + self.api.login(TEST_ACCESS_TOKEN1) + self.api.create_model( + model_id=self.create_model_name, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + os.system("echo '111'>%s" + % os.path.join(self.finetune_path, 'add1.py')) + upload_folder( + model_id=self.create_model_name, model_dir=self.finetune_path) + Repository(model_dir=self.repo_path, clone_from=self.create_model_name) + assert os.path.exists(os.path.join(self.repo_path, 'add1.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + os.system("echo '222'>%s" + % os.path.join(self.finetune_path, 'add2.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_revision/version1') + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_revision/version1') + assert os.path.exists(os.path.join(self.repo_path, 'add2.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + os.system("echo '333'>%s" + % os.path.join(self.finetune_path, 'add3.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_revision/version2', + commit_message='add add3.py') + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_revision/version2') + assert os.path.exists(os.path.join(self.repo_path, 'add2.py')) + assert os.path.exists(os.path.join(self.repo_path, 'add3.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + add4_path = os.path.join(self.finetune_path, 'temp') + os.mkdir(add4_path) + os.system("echo '444'>%s" % os.path.join(add4_path, 'add4.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_revision/version1') + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_revision/version1') + assert os.path.exists(os.path.join(add4_path, 'add4.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_upload_non_exists_repo(self): + logger.info('test upload non exists repo!') + 
self.api.login(TEST_ACCESS_TOKEN1) + os.system("echo '111'>%s" + % os.path.join(self.finetune_path, 'add1.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_model_new_revision', + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_model_new_revision') + assert os.path.exists(os.path.join(self.repo_path, 'add1.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_upload_without_token(self): + logger.info('test upload without login!') + self.api.login(TEST_ACCESS_TOKEN1) + delete_credential() + try: + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + except Exception as e: + logger.info(e) + self.api.login(TEST_ACCESS_TOKEN1) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + Repository( + model_dir=self.repo_path, clone_from=self.create_model_name) + assert os.path.exists( + os.path.join(self.repo_path, 'configuration.json')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_upload_invalid_repo(self): + logger.info('test upload to invalid repo!') + self.api.login(TEST_ACCESS_TOKEN1) + try: + upload_folder( + model_id='%s/%s' % ('speech_tts', 'invalid_model_test'), + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + except Exception as e: + logger.info(e) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + Repository( + model_dir=self.repo_path, clone_from=self.create_model_name) + assert os.path.exists( + os.path.join(self.repo_path, 'configuration.json')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + +if __name__ == '__main__': + unittest.main() From 7720ae50e241ed3a5cf319d9410b774228d8126c Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Mon, 17 Oct 2022 20:30:42 +0800 Subject: [PATCH 49/57] return dict values when input single sample for easycv pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10423383 --- .../pipelines/cv/easycv_pipelines/base.py | 18 +++++++++++++++++- .../cv/easycv_pipelines/detection_pipeline.py | 3 +++ .../face_2d_keypoints_pipeline.py | 3 +++ .../human_wholebody_keypoint_pipeline.py | 3 +++ tests/pipelines/test_face_2d_keypoints.py | 2 +- tests/pipelines/test_hand_2d_keypoints.py | 9 ++------- .../pipelines/test_human_wholebody_keypoint.py | 2 +- tests/pipelines/test_object_detection.py | 2 +- 8 files changed, 31 insertions(+), 11 deletions(-) diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py index 8aea1146..c130aea0 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/base.py +++ b/modelscope/pipelines/cv/easycv_pipelines/base.py @@ -4,7 +4,9 @@ import os import os.path as osp from typing import Any +import numpy as np from easycv.utils.ms_utils import EasyCVMeta +from PIL import ImageFile from modelscope.hub.snapshot_download import snapshot_download from modelscope.pipelines.util import is_official_hub_path @@ -94,5 +96,19 @@ class EasyCVPipeline(object): return 
easycv_config + def _is_single_inputs(self, inputs): + if isinstance(inputs, str) or (isinstance(inputs, list) + and len(inputs) == 1) or isinstance( + inputs, np.ndarray) or isinstance( + inputs, ImageFile.ImageFile): + return True + + return False + def __call__(self, inputs) -> Any: - return self.predict_op(inputs) + outputs = self.predict_op(inputs) + + if self._is_single_inputs(inputs): + outputs = outputs[0] + + return outputs diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py index 0c2058d5..a1173bc4 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py @@ -57,4 +57,7 @@ class EasyCVDetectionPipeline(EasyCVPipeline): OutputKeys.BOXES: boxes } for output in outputs] + if self._is_single_inputs(inputs): + results = results[0] + return results diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index 7c32e0fc..b48d013e 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -40,4 +40,7 @@ class Face2DKeypointsPipeline(EasyCVPipeline): OutputKeys.POSES: output['pose'] } for output in outputs] + if self._is_single_inputs(inputs): + results = results[0] + return results diff --git a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py index 263f8225..936accbf 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py @@ -62,4 +62,7 @@ class HumanWholebodyKeypointsPipeline(EasyCVPipeline): OutputKeys.BOXES: output['boxes'] } for output in outputs] + if self._is_single_inputs(inputs): + results = results[0] + return results diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py index 667ecddc..a5e347e8 100644 --- a/tests/pipelines/test_face_2d_keypoints.py +++ b/tests/pipelines/test_face_2d_keypoints.py @@ -18,7 +18,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): face_2d_keypoints_align = pipeline( task=Tasks.face_2d_keypoints, model=model_id) - output = face_2d_keypoints_align(img_path)[0] + output = face_2d_keypoints_align(img_path) output_keypoints = output[OutputKeys.KEYPOINTS] output_pose = output[OutputKeys.POSES] diff --git a/tests/pipelines/test_hand_2d_keypoints.py b/tests/pipelines/test_hand_2d_keypoints.py index 86cd2d06..43b569d0 100644 --- a/tests/pipelines/test_hand_2d_keypoints.py +++ b/tests/pipelines/test_hand_2d_keypoints.py @@ -15,10 +15,8 @@ class Hand2DKeypointsPipelineTest(unittest.TestCase): model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody' hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints, model=model_id) - outputs = hand_keypoint(img_path) - self.assertEqual(len(outputs), 1) + results = hand_keypoint(img_path) - results = outputs[0] self.assertIn(OutputKeys.KEYPOINTS, results.keys()) self.assertIn(OutputKeys.BOXES, results.keys()) self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) @@ -30,10 +28,7 @@ class Hand2DKeypointsPipelineTest(unittest.TestCase): img_path = 'data/test/images/hand_keypoints.jpg' hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints) - outputs = 
hand_keypoint(img_path) - self.assertEqual(len(outputs), 1) - - results = outputs[0] + results = hand_keypoint(img_path) self.assertIn(OutputKeys.KEYPOINTS, results.keys()) self.assertIn(OutputKeys.BOXES, results.keys()) self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) diff --git a/tests/pipelines/test_human_wholebody_keypoint.py b/tests/pipelines/test_human_wholebody_keypoint.py index b214f4e1..7c5946cc 100644 --- a/tests/pipelines/test_human_wholebody_keypoint.py +++ b/tests/pipelines/test_human_wholebody_keypoint.py @@ -18,7 +18,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): human_wholebody_keypoint_pipeline = pipeline( task=Tasks.human_wholebody_keypoint, model=model_id) - output = human_wholebody_keypoint_pipeline(img_path)[0] + output = human_wholebody_keypoint_pipeline(img_path) output_keypoints = output[OutputKeys.KEYPOINTS] output_pose = output[OutputKeys.BOXES] diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index 00a71371..64766c77 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -55,7 +55,7 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): image_object_detection_auto = pipeline( Tasks.image_object_detection, model=model_id) - result = image_object_detection_auto(test_image)[0] + result = image_object_detection_auto(test_image) image_object_detection_auto.show_result(test_image, result, 'auto_demo_ret.jpg') From ac07b719e9b83c5da6c108e75e0767211f343016 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 17 Oct 2022 20:51:58 +0800 Subject: [PATCH 50/57] [to #45546922]feat: add fasttext package Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10431169 * [to #45546922]feat: add fasttext package --- docker/Dockerfile.ubuntu | 2 +- modelscope/hub/errors.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index a9a409b5..6dafbc3e 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -76,7 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ ENV SHELL=/bin/bash # install special package -RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq +RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl RUN if [ "$USE_GPU" = "True" ] ; then \ pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index fb483287..bd7a20ac 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -53,8 +53,8 @@ def handle_http_post_error(response, url, request_body): try: response.raise_for_status() except HTTPError as error: - logger.error('Request %s with body: %s exception, respoonse body: %s' % - (url, request_body, response.body)) + logger.error('Request %s with body: %s exception' % + (url, request_body)) raise error From 271e2a2a9916de3bd64e40dd4c836d341fed4b77 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Mon, 17 Oct 2022 20:54:29 +0800 Subject: [PATCH 51/57] [to #42322933] Add gpt_neo model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 添加 gpt_neo 模型,因 checkpoint 归属于 Langboat 还未上传到模型库,已线下完成测试 2. 
添加 text-generation task models 与 head,后续会将 gpt3,palm 等已上线文本生成模型统一为 backbone + head 结构的 task models Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10404249 --- modelscope/metainfo.py | 5 ++ modelscope/models/nlp/__init__.py | 4 +- modelscope/models/nlp/backbones/gpt_neo.py | 15 ++++ .../models/nlp/heads/text_generation_head.py | 35 ++++++++ modelscope/models/nlp/task_models/__init__.py | 2 + .../models/nlp/task_models/text_generation.py | 79 +++++++++++++++++++ .../pipelines/nlp/text_generation_pipeline.py | 38 ++++++--- modelscope/preprocessors/__init__.py | 2 + modelscope/preprocessors/nlp/__init__.py | 2 + modelscope/preprocessors/nlp/nlp_base.py | 21 +++++ tests/pipelines/test_text_generation.py | 13 +++ tests/utils/test_ast.py | 2 +- 12 files changed, 207 insertions(+), 11 deletions(-) create mode 100644 modelscope/models/nlp/backbones/gpt_neo.py create mode 100644 modelscope/models/nlp/heads/text_generation_head.py create mode 100644 modelscope/models/nlp/task_models/text_generation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 2e3fed98..fb99bc71 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -71,6 +71,7 @@ class Models(object): gcnncrf = 'gcnn-crf' bart = 'bart' gpt3 = 'gpt3' + gpt_neo = 'gpt-neo' plug = 'plug' bert_for_ds = 'bert-for-document-segmentation' ponet = 'ponet' @@ -101,6 +102,7 @@ class TaskModels(object): information_extraction = 'information-extraction' fill_mask = 'fill-mask' feature_extraction = 'feature-extraction' + text_generation = 'text-generation' class Heads(object): @@ -116,6 +118,8 @@ class Heads(object): token_classification = 'token-classification' # extraction information_extraction = 'information-extraction' + # text gen + text_generation = 'text-generation' class Pipelines(object): @@ -341,6 +345,7 @@ class Preprocessors(object): re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + sentence_piece = 'sentence-piece' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 8ef96365..9e830d17 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -30,7 +30,8 @@ if TYPE_CHECKING: InformationExtractionModel, SequenceClassificationModel, SingleBackboneTaskModelBase, - TokenClassificationModel) + TokenClassificationModel, + TaskModelForTextGeneration) from .token_classification import SbertForTokenClassification from .sentence_embedding import SentenceEmbedding from .passage_ranking import PassageRanking @@ -69,6 +70,7 @@ else: 'SequenceClassificationModel', 'SingleBackboneTaskModelBase', 'TokenClassificationModel', + 'TaskModelForTextGeneration', ], 'token_classification': ['SbertForTokenClassification'], 'table_question_answering': ['TableQuestionAnswering'], diff --git a/modelscope/models/nlp/backbones/gpt_neo.py b/modelscope/models/nlp/backbones/gpt_neo.py new file mode 100644 index 00000000..a2d0c374 --- /dev/null +++ b/modelscope/models/nlp/backbones/gpt_neo.py @@ -0,0 +1,15 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
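The new backbone below wraps the Hugging Face GPT-Neo implementation and builds its configuration directly from keyword arguments, which is what allows it to be registered and constructed from a plain configuration dict. A minimal illustrative sketch of that construction pattern follows; the vocab_size value is a placeholder, not the setting of any released checkpoint.

from transformers import GPTNeoConfig, GPTNeoModel

# Keyword arguments become a GPTNeoConfig; unspecified fields keep the HF defaults.
config = GPTNeoConfig(vocab_size=50257)
backbone = GPTNeoModel(config)
print(backbone.config.num_layers)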
+from transformers import GPTNeoConfig +from transformers import GPTNeoModel as GPTNeoModelTransform + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils.constant import Fields + + +@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.gpt_neo) +class GPTNeoModel(GPTNeoModelTransform): + + def __init__(self, **kwargs): + config = GPTNeoConfig(**kwargs) + super().__init__(config) diff --git a/modelscope/models/nlp/heads/text_generation_head.py b/modelscope/models/nlp/heads/text_generation_head.py new file mode 100644 index 00000000..606d5a1f --- /dev/null +++ b/modelscope/models/nlp/heads/text_generation_head.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Dict + +import torch +import torch.nn.functional as F +from torch import nn + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + + +@HEADS.register_module( + Tasks.text_generation, module_name=Heads.text_generation) +class TextGenerationHead(TorchHead): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + config = self.config + self.linear = nn.Linear( + config['hidden_size'], config['vocab_size'], bias=False) + + def get_output_embeddings(self): + return self.linear + + def forward(self, inputs=None): + logits = self.linear(inputs) + return {OutputKeys.LOGITS: logits} + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + logits = outputs[OutputKeys.LOGITS] + return {OutputKeys.LOSS: F.cross_entropy(logits, labels)} diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 90f22aa1..38359044 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel + from .text_generation import TaskModelForTextGeneration else: _import_structure = { @@ -19,6 +20,7 @@ else: 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], + 'text_generation': ['TaskModelForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/task_models/text_generation.py b/modelscope/models/nlp/task_models/text_generation.py new file mode 100644 index 00000000..973198ae --- /dev/null +++ b/modelscope/models/nlp/task_models/text_generation.py @@ -0,0 +1,79 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
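The task model defined below composes a registered backbone with the text-generation head added above, and when `shared_embedding` is enabled it ties the head's output projection to the backbone's input embedding. A minimal sketch of that weight-tying step, using plain PyTorch modules as stand-ins (the sizes are arbitrary):

import torch.nn as nn

vocab_size, hidden_size = 100, 16
embedding = nn.Embedding(vocab_size, hidden_size)          # stands in for the backbone's input embedding
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)   # stands in for the head's output projection
lm_head.weight = embedding.weight                          # the two modules now share one parameter
assert lm_head.weight.data_ptr() == embedding.weight.data_ptr()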
+from typing import Any, Dict + +import addict +import numpy as np +from transformers.modeling_utils import PreTrainedModel + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.task_models.task_model import \ + SingleBackboneTaskModelBase +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + +__all__ = ['TaskModelForTextGeneration'] + + +@MODELS.register_module( + Tasks.text_generation, module_name=TaskModels.text_generation) +class TaskModelForTextGeneration(SingleBackboneTaskModelBase, PreTrainedModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text generation model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) + if self.config.get('shared_embedding', False): + input_embeddings = self.backbone.get_input_embeddings() + output_embeddings = self.head.get_output_embeddings() + output_embeddings.weight = input_embeddings.weight + + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + # backbone do not need labels, only head need for loss compute + labels = input.pop(OutputKeys.LABELS, None) + + backbone_outputs = super().forward(input) + hidden_states = backbone_outputs[0] + + outputs = self.head.forward(hidden_states) + if labels is not None: + input[OutputKeys.LABELS] = labels + loss = self.compute_loss(outputs, labels) + outputs.update(loss) + return addict.Dict(outputs) + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + token_type_ids = kwargs.get('token_type_ids', None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get('attention_mask', None) + position_ids = kwargs.get('position_ids', None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + return { + 'input_ids': input_ids, + 'past_key_values': past, + 'use_cache': kwargs.get('use_cache'), + 'position_ids': position_ids, + 'attention_mask': attention_mask, + 'token_type_ids': token_type_ids, + } diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index ea35763f..ae92f26a 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -6,10 +6,12 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models.base import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TextGenerationPreprocessor -from modelscope.utils.constant import Tasks +from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.hub import read_config __all__ = ['TextGenerationPipeline'] @@ -20,7 +22,7 @@ class 
TextGenerationPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: Optional[TextGenerationPreprocessor] = None, + preprocessor: Optional[Preprocessor] = None, first_sequence='sentence', **kwargs): """Use `model` and `preprocessor` to create a generation pipeline for prediction. @@ -50,19 +52,34 @@ class TextGenerationPipeline(Pipeline): """ model = model if isinstance(model, Model) else Model.from_pretrained(model) + cfg = read_config(model.model_dir) + self.postprocessor = cfg.pop('postprocessor', None) if preprocessor is None: - preprocessor = TextGenerationPreprocessor( + preprocessor_cfg = cfg.preprocessor + preprocessor_cfg.update({ + 'model_dir': model.model_dir, - first_sequence=first_sequence, - second_sequence=None, - sequence_length=kwargs.pop('sequence_length', 128)) + 'first_sequence': + first_sequence, + 'second_sequence': + None, + 'sequence_length': + kwargs.pop('sequence_length', 128) + }) + preprocessor = build_preprocessor(preprocessor_cfg, Fields.nlp) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) + def _sanitize_parameters(self, **pipeline_parameters): + return {}, pipeline_parameters, {} + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return self.model.generate(inputs) + return self.model.generate(inputs, **forward_params) + + def sentence_piece(self, inputs) -> Dict[str, Tensor]: + return self.preprocessor.tokenizer.decode(inputs.tolist())[0] def postprocess(self, inputs: Dict[str, Tensor], **postprocess_params) -> Dict[str, str]: @@ -74,4 +91,7 @@ class TextGenerationPipeline(Pipeline): Returns: Dict[str, str]: the prediction results """ - return inputs + return inputs if self.postprocessor is None else { + OutputKeys.TEXT: + getattr(self, self.postprocessor.replace('-', '_'))(inputs) + } diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 90303b65..43fa64a7 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + SentencePiecePreprocessor, ) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, @@ -71,6 +72,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'SentencePiecePreprocessor', ], 'space': [ 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index dfbb5c81..a753fe6c 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + SentencePiecePreprocessor, ) else: @@ -41,6 +42,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'SentencePiecePreprocessor', ], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index bec7e4e1..3d708634 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -5,6 +5,7 @@ import re from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np +import sentencepiece as spm 
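The generation pipeline above dispatches postprocessing by name, so a `sentence_piece` postprocessor decodes generated ids back into text, while the SentencePiecePreprocessor added further down in this file encodes the prompt with the same SentencePiece model. A small sketch of that encode/decode round trip; 'tokenizer.model' is a placeholder path to a trained SentencePiece model and the prompt is arbitrary.

import sentencepiece as spm
import torch

sp = spm.SentencePieceProcessor(model_file='tokenizer.model')   # placeholder path
ids = torch.tensor(sp.encode(['我是']), dtype=torch.long)        # shape [1, seq_len], as the preprocessor returns
text = sp.decode(ids.tolist())[0]                                # back to a plain string, as in postprocessing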
import torch from transformers import AutoTokenizer @@ -1160,3 +1161,23 @@ class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): self.labels_to_id(labels, output) return output + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_piece) +class SentencePiecePreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + import os + + super().__init__(*args, **kwargs) + self.tokenizer = None + for file_name in os.listdir(model_dir): + if file_name.endswith('.model'): + m_file = osp.join(model_dir, file_name) + self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) + break + assert self.tokenizer is not None, 'Can not find .model file' + + def __call__(self, data: str) -> Dict[str, Any]: + return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index 66f9c9da..5a270f83 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -133,6 +133,19 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): def test_demo_compatibility(self): self.compatibility_check() + @unittest.skip("Langboat's checkpoint has not been uploaded to modelhub") + def test_gpt_neo(self): + pipe = pipeline( + task=Tasks.text_generation, model='Langboat/mengzi-gpt-neo-base') + print( + pipe( + '我是', + do_sample=True, + top_k=5, + top_p=1, + max_length=20, + repetition_penalty=0.5)) + if __name__ == '__main__': unittest.main() diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py index 9a8ab828..c0624679 100644 --- a/tests/utils/test_ast.py +++ b/tests/utils/test_ast.py @@ -41,7 +41,7 @@ class AstScaningTest(unittest.TestCase): self.assertIsInstance(from_imports, dict) self.assertIsInstance(decorators, list) self.assertListEqual(list(set(imports.keys()) - set(['torch'])), []) - self.assertEqual(len(from_imports.keys()), 7) + self.assertEqual(len(from_imports.keys()), 9) self.assertTrue(from_imports['modelscope.metainfo'] is not None) self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines']) self.assertEqual(decorators, From 172522d19654a9e6c3d872170753086cf2452411 Mon Sep 17 00:00:00 2001 From: "leyuan.hjy" Date: Mon, 17 Oct 2022 20:58:23 +0800 Subject: [PATCH 52/57] [to #42322933]video-object-detection init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增video-object-detection 算法 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10247489 --- data/test/videos/test_realtime_vod.mp4 | 3 + modelscope/metainfo.py | 2 + .../cv/realtime_object_detection/__init__.py | 2 + .../realtime_video_detector.py | 117 +++++++ .../yolox/exp/build.py | 2 + .../yolox/exp/default/__init__.py | 2 +- .../yolox/exp/default/streamyolo.py | 43 +++ .../yolox/exp/yolox_base.py | 1 - .../yolox/models/__init__.py | 3 + .../yolox/models/dfp_pafpn.py | 307 ++++++++++++++++++ .../yolox/models/network_blocks.py | 1 - .../yolox/models/streamyolo.py | 41 +++ .../yolox/models/tal_head.py | 170 ++++++++++ modelscope/outputs.py | 31 +- ...ealtime_video_object_detection_pipeline.py | 59 ++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 60 ++++ .../test_realtime_video_object_detection.py | 46 +++ 18 files changed, 886 insertions(+), 5 deletions(-) create mode 100644 data/test/videos/test_realtime_vod.mp4 create mode 100644 modelscope/models/cv/realtime_object_detection/realtime_video_detector.py create mode 100644 
modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py create mode 100644 modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py create mode 100644 tests/pipelines/test_realtime_video_object_detection.py diff --git a/data/test/videos/test_realtime_vod.mp4 b/data/test/videos/test_realtime_vod.mp4 new file mode 100644 index 00000000..a0e44852 --- /dev/null +++ b/data/test/videos/test_realtime_vod.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f58df1d25590c158ae0a04b3999bd44b610cdaddb17d78afd84c34b3f00d4e87 +size 4068783 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index fb99bc71..e4a26303 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -14,6 +14,7 @@ class Models(object): # vision models detection = 'detection' realtime_object_detection = 'realtime-object-detection' + realtime_video_object_detection = 'realtime-video-object-detection' scrfd = 'scrfd' classification_model = 'ClassificationModel' nafnet = 'nafnet' @@ -170,6 +171,7 @@ class Pipelines(object): face_image_generation = 'gan-face-image-generation' product_retrieval_embedding = 'resnet50-product-retrieval-embedding' realtime_object_detection = 'cspnet_realtime-object-detection_yolox' + realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo' face_recognition = 'ir101-face-recognition-cfglint' image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' image2image_translation = 'image-to-image-translation' diff --git a/modelscope/models/cv/realtime_object_detection/__init__.py b/modelscope/models/cv/realtime_object_detection/__init__.py index aed13cec..66156977 100644 --- a/modelscope/models/cv/realtime_object_detection/__init__.py +++ b/modelscope/models/cv/realtime_object_detection/__init__.py @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .realtime_detector import RealtimeDetector + from .realtime_video_detector import RealtimeVideoDetector else: _import_structure = { 'realtime_detector': ['RealtimeDetector'], + 'realtime_video_detector': ['RealtimeVideoDetector'], } import sys diff --git a/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py new file mode 100644 index 00000000..fc7339b3 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py @@ -0,0 +1,117 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
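The detector implemented below is exposed through the new video-object-detection pipeline. Its intended calling pattern, mirroring the unit test added at the end of this patch (the model id and sample video path are taken from that test and assume the model is available on the hub):

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

vod = pipeline(Tasks.video_object_detection,
               model='damo/cv_cspnet_video-object-detection_streamyolo')
result = vod('data/test/videos/test_realtime_vod.mp4')
print(len(result[OutputKeys.BOXES]))  # one entry per decoded frame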
+import argparse +import logging as logger +import os +import os.path as osp +import time + +import cv2 +import json +import torch +from tqdm import tqdm + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.preprocessors import LoadImage +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from .yolox.data.data_augment import ValTransform +from .yolox.exp import get_exp_by_name +from .yolox.utils import postprocess + + +@MODELS.register_module( + group_key=Tasks.video_object_detection, + module_name=Models.realtime_video_object_detection) +class RealtimeVideoDetector(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + + # model type + self.exp = get_exp_by_name(self.config.model_type) + + # build model + self.model = self.exp.get_model() + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) + ckpt = torch.load(model_path, map_location='cpu') + + # load the model state dict + self.model.load_state_dict(ckpt['model']) + self.model.eval() + + # params setting + self.exp.num_classes = self.config.num_classes + self.confthre = self.config.conf_thr + self.num_classes = self.exp.num_classes + self.nmsthre = self.exp.nmsthre + self.test_size = self.exp.test_size + self.preproc = ValTransform(legacy=False) + self.current_buffer = None + self.label_mapping = self.config['labels'] + + def inference(self, img): + with torch.no_grad(): + outputs, self.current_buffer = self.model( + img, buffer=self.current_buffer, mode='on_pipe') + return outputs + + def forward(self, inputs): + return self.inference_video(inputs) + + def preprocess(self, img): + img = LoadImage.convert_to_ndarray(img) + height, width = img.shape[:2] + self.ratio = min(self.test_size[0] / img.shape[0], + self.test_size[1] / img.shape[1]) + + img, _ = self.preproc(img, None, self.test_size) + img = torch.from_numpy(img).unsqueeze(0) + img = img.float() + + # Video decoding and preprocessing automatically are not supported by Pipeline/Model + # Sending preprocessed video frame tensor to GPU buffer self-adaptively + if next(self.model.parameters()).is_cuda: + img = img.to(next(self.model.parameters()).device) + return img + + def postprocess(self, input): + outputs = postprocess( + input, + self.num_classes, + self.confthre, + self.nmsthre, + class_agnostic=True) + + if len(outputs) == 1: + bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio + scores = outputs[0][:, 5].cpu().numpy() + labels = outputs[0][:, 6].cpu().int().numpy() + pred_label_names = [] + for lab in labels: + pred_label_names.append(self.label_mapping[lab]) + + return bboxes, scores, pred_label_names + + def inference_video(self, v_path): + outputs = [] + desc = 'Detecting video: {}'.format(v_path) + for frame, result in tqdm( + self.inference_video_iter(v_path), desc=desc): + outputs.append(result) + + return outputs + + def inference_video_iter(self, v_path): + capture = cv2.VideoCapture(v_path) + while capture.isOpened(): + ret, frame = capture.read() + if not ret: + break + output = self.preprocess(frame) + output = self.inference(output) + output = self.postprocess(output) + yield frame, output diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py 
b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py index 4858100c..5865c53b 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py @@ -13,6 +13,8 @@ def get_exp_by_name(exp_name): from .default import YoloXNanoExp as YoloXExp elif exp == 'yolox_tiny': from .default import YoloXTinyExp as YoloXExp + elif exp == 'streamyolo': + from .default import StreamYoloExp as YoloXExp else: pass return YoloXExp() diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py index 552bbccd..cfec836c 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py @@ -1,5 +1,5 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX - +from .streamyolo import StreamYoloExp from .yolox_nano import YoloXNanoExp from .yolox_s import YoloXSExp from .yolox_tiny import YoloXTinyExp diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py new file mode 100644 index 00000000..5a62c8fc --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py @@ -0,0 +1,43 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import os +import sys + +import torch + +from ..yolox_base import Exp as YoloXExp + + +class StreamYoloExp(YoloXExp): + + def __init__(self): + super(YoloXExp, self).__init__() + self.depth = 1.0 + self.width = 1.0 + self.num_classes = 8 + self.test_size = (600, 960) + self.test_conf = 0.3 + self.nmsthre = 0.65 + + def get_model(self): + from ...models import StreamYOLO, DFPPAFPN, TALHead + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + if getattr(self, 'model', None) is None: + in_channels = [256, 512, 1024] + backbone = DFPPAFPN( + self.depth, self.width, in_channels=in_channels) + head = TALHead( + self.num_classes, + self.width, + in_channels=in_channels, + gamma=1.0, + ignore_thr=0.5, + ignore_value=1.6) + self.model = StreamYOLO(backbone, head) + + return self.model diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py index a2a41535..c5159a9f 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py @@ -1,5 +1,4 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX - import os import random diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py index 20b1a0d1..d2e889f1 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py @@ -1,6 +1,9 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX from .darknet import CSPDarknet, Darknet +from .dfp_pafpn import DFPPAFPN +from .streamyolo import StreamYOLO +from .tal_head import TALHead from .yolo_fpn import YOLOFPN 
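DFPPAFPN, added below, fuses the FPN features of the current frame with buffered features of the previous frame: each is reduced to half its channels by a 1x1 convolution, the two halves are concatenated, and the result is added back to the current features. A toy sketch of that fusion step; the channel count and spatial size are arbitrary, and `reduce` stands in for the 1x1 `jian*` convolutions.

import torch
import torch.nn as nn

channels = 64
reduce = nn.Conv2d(channels, channels // 2, kernel_size=1)
current = torch.randn(1, channels, 20, 20)    # features of the current frame
previous = torch.randn(1, channels, 20, 20)   # buffered features of the previous frame
fused = torch.cat([reduce(current), reduce(previous)], dim=1) + current   # same shape as `current`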
from .yolo_head import YOLOXHead from .yolo_pafpn import YOLOPAFPN diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py new file mode 100644 index 00000000..01284791 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py @@ -0,0 +1,307 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .darknet import CSPDarknet +from .network_blocks import BaseConv, CSPLayer, DWConv + + +class DFPPAFPN(nn.Module): + """ + YOLOv3 model. Darknet 53 is the default backbone of this model. + """ + + def __init__( + self, + depth=1.0, + width=1.0, + in_features=('dark3', 'dark4', 'dark5'), + in_channels=[256, 512, 1024], + depthwise=False, + act='silu', + ): + super().__init__() + self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) + self.in_features = in_features + self.in_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + self.lateral_conv0 = BaseConv( + int(in_channels[2] * width), + int(in_channels[1] * width), + 1, + 1, + act=act) + self.C3_p4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) # cat + + self.reduce_conv1 = BaseConv( + int(in_channels[1] * width), + int(in_channels[0] * width), + 1, + 1, + act=act) + self.C3_p3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[0] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv2 = Conv( + int(in_channels[0] * width), + int(in_channels[0] * width), + 3, + 2, + act=act) + self.C3_n3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv1 = Conv( + int(in_channels[1] * width), + int(in_channels[1] * width), + 3, + 2, + act=act) + self.C3_n4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + self.jian2 = Conv( + in_channels=int(in_channels[0] * width), + out_channels=int(in_channels[0] * width) // 2, + ksize=1, + stride=1, + act=act, + ) + + self.jian1 = Conv( + in_channels=int(in_channels[1] * width), + out_channels=int(in_channels[1] * width) // 2, + ksize=1, + stride=1, + act=act, + ) + + self.jian0 = Conv( + in_channels=int(in_channels[2] * width), + out_channels=int(in_channels[2] * width) // 2, + ksize=1, + stride=1, + act=act, + ) + + def off_forward(self, input): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. 
+ """ + + # backbone + rurrent_out_features = self.backbone(torch.split(input, 3, dim=1)[0]) + rurrent_features = [rurrent_out_features[f] for f in self.in_features] + [rurrent_x2, rurrent_x1, rurrent_x0] = rurrent_features + + rurrent_fpn_out0 = self.lateral_conv0(rurrent_x0) # 1024->512/32 + rurrent_f_out0 = F.interpolate( + rurrent_fpn_out0, size=rurrent_x1.shape[2:4], + mode='nearest') # 512/16 + rurrent_f_out0 = torch.cat([rurrent_f_out0, rurrent_x1], + 1) # 512->1024/16 + rurrent_f_out0 = self.C3_p4(rurrent_f_out0) # 1024->512/16 + + rurrent_fpn_out1 = self.reduce_conv1(rurrent_f_out0) # 512->256/16 + rurrent_f_out1 = F.interpolate( + rurrent_fpn_out1, size=rurrent_x2.shape[2:4], + mode='nearest') # 256/8 + rurrent_f_out1 = torch.cat([rurrent_f_out1, rurrent_x2], + 1) # 256->512/8 + rurrent_pan_out2 = self.C3_p3(rurrent_f_out1) # 512->256/8 + + rurrent_p_out1 = self.bu_conv2(rurrent_pan_out2) # 256->256/16 + rurrent_p_out1 = torch.cat([rurrent_p_out1, rurrent_fpn_out1], + 1) # 256->512/16 + rurrent_pan_out1 = self.C3_n3(rurrent_p_out1) # 512->512/16 + + rurrent_p_out0 = self.bu_conv1(rurrent_pan_out1) # 512->512/32 + rurrent_p_out0 = torch.cat([rurrent_p_out0, rurrent_fpn_out0], + 1) # 512->1024/32 + rurrent_pan_out0 = self.C3_n4(rurrent_p_out0) # 1024->1024/32 + + ##### + + support_out_features = self.backbone(torch.split(input, 3, dim=1)[1]) + support_features = [support_out_features[f] for f in self.in_features] + [support_x2, support_x1, support_x0] = support_features + + support_fpn_out0 = self.lateral_conv0(support_x0) # 1024->512/32 + support_f_out0 = F.interpolate( + support_fpn_out0, size=support_x1.shape[2:4], + mode='nearest') # 512/16 + support_f_out0 = torch.cat([support_f_out0, support_x1], + 1) # 512->1024/16 + support_f_out0 = self.C3_p4(support_f_out0) # 1024->512/16 + + support_fpn_out1 = self.reduce_conv1(support_f_out0) # 512->256/16 + support_f_out1 = F.interpolate( + support_fpn_out1, size=support_x2.shape[2:4], + mode='nearest') # 256/8 + support_f_out1 = torch.cat([support_f_out1, support_x2], + 1) # 256->512/8 + support_pan_out2 = self.C3_p3(support_f_out1) # 512->256/8 + + support_p_out1 = self.bu_conv2(support_pan_out2) # 256->256/16 + support_p_out1 = torch.cat([support_p_out1, support_fpn_out1], + 1) # 256->512/16 + support_pan_out1 = self.C3_n3(support_p_out1) # 512->512/16 + + support_p_out0 = self.bu_conv1(support_pan_out1) # 512->512/32 + support_p_out0 = torch.cat([support_p_out0, support_fpn_out0], + 1) # 512->1024/32 + support_pan_out0 = self.C3_n4(support_p_out0) # 1024->1024/32 + + # 0.5 channel + pan_out2 = torch.cat( + [self.jian2(rurrent_pan_out2), + self.jian2(support_pan_out2)], + dim=1) + rurrent_pan_out2 + pan_out1 = torch.cat( + [self.jian1(rurrent_pan_out1), + self.jian1(support_pan_out1)], + dim=1) + rurrent_pan_out1 + pan_out0 = torch.cat( + [self.jian0(rurrent_pan_out0), + self.jian0(support_pan_out0)], + dim=1) + rurrent_pan_out0 + + outputs = (pan_out2, pan_out1, pan_out0) + + return outputs + + def online_forward(self, input, buffer=None, node='star'): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. 
+ """ + + # backbone + rurrent_out_features = self.backbone(input) + rurrent_features = [rurrent_out_features[f] for f in self.in_features] + [rurrent_x2, rurrent_x1, rurrent_x0] = rurrent_features + + rurrent_fpn_out0 = self.lateral_conv0(rurrent_x0) # 1024->512/32 + rurrent_f_out0 = F.interpolate( + rurrent_fpn_out0, size=rurrent_x1.shape[2:4], + mode='nearest') # 512/16 + rurrent_f_out0 = torch.cat([rurrent_f_out0, rurrent_x1], + 1) # 512->1024/16 + rurrent_f_out0 = self.C3_p4(rurrent_f_out0) # 1024->512/16 + + rurrent_fpn_out1 = self.reduce_conv1(rurrent_f_out0) # 512->256/16 + rurrent_f_out1 = F.interpolate( + rurrent_fpn_out1, size=rurrent_x2.shape[2:4], + mode='nearest') # 256/8 + rurrent_f_out1 = torch.cat([rurrent_f_out1, rurrent_x2], + 1) # 256->512/8 + rurrent_pan_out2 = self.C3_p3(rurrent_f_out1) # 512->256/8 + + rurrent_p_out1 = self.bu_conv2(rurrent_pan_out2) # 256->256/16 + rurrent_p_out1 = torch.cat([rurrent_p_out1, rurrent_fpn_out1], + 1) # 256->512/16 + rurrent_pan_out1 = self.C3_n3(rurrent_p_out1) # 512->512/16 + + rurrent_p_out0 = self.bu_conv1(rurrent_pan_out1) # 512->512/32 + rurrent_p_out0 = torch.cat([rurrent_p_out0, rurrent_fpn_out0], + 1) # 512->1024/32 + rurrent_pan_out0 = self.C3_n4(rurrent_p_out0) # 1024->1024/32 + + ##### + if node == 'star': + pan_out2 = torch.cat( + [self.jian2(rurrent_pan_out2), + self.jian2(rurrent_pan_out2)], + dim=1) + rurrent_pan_out2 + pan_out1 = torch.cat( + [self.jian1(rurrent_pan_out1), + self.jian1(rurrent_pan_out1)], + dim=1) + rurrent_pan_out1 + pan_out0 = torch.cat( + [self.jian0(rurrent_pan_out0), + self.jian0(rurrent_pan_out0)], + dim=1) + rurrent_pan_out0 + elif node == 'buffer': + + [support_pan_out2, support_pan_out1, support_pan_out0] = buffer + + pan_out2 = torch.cat( + [self.jian2(rurrent_pan_out2), + self.jian2(support_pan_out2)], + dim=1) + rurrent_pan_out2 + pan_out1 = torch.cat( + [self.jian1(rurrent_pan_out1), + self.jian1(support_pan_out1)], + dim=1) + rurrent_pan_out1 + pan_out0 = torch.cat( + [self.jian0(rurrent_pan_out0), + self.jian0(support_pan_out0)], + dim=1) + rurrent_pan_out0 + + outputs = (pan_out2, pan_out1, pan_out0) + + buffer_ = (rurrent_pan_out2, rurrent_pan_out1, rurrent_pan_out0) + + return outputs, buffer_ + + def forward(self, input, buffer=None, mode='off_pipe'): + + if mode == 'off_pipe': + # Glops caculate mode + if input.size()[1] == 3: + input = torch.cat([input, input], dim=1) + output = self.off_forward(input) + # offline train mode + elif input.size()[1] == 6: + output = self.off_forward(input) + + return output + + elif mode == 'on_pipe': + # online star state + if buffer is None: + output, buffer_ = self.online_forward(input, node='star') + # online inference + else: + assert len(buffer) == 3 + assert input.size()[1] == 3 + output, buffer_ = self.online_forward( + input, buffer=buffer, node='buffer') + + return output, buffer_ diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py index fd15c1c1..88bd55c7 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py @@ -1,5 +1,4 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX - import torch import torch.nn as nn diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py 
b/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py new file mode 100644 index 00000000..b3ec3504 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py @@ -0,0 +1,41 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import torch.nn as nn + +from .dfp_pafpn import DFPPAFPN +from .tal_head import TALHead + + +class StreamYOLO(nn.Module): + """ + YOLOX model module. The module list is defined by create_yolov3_modules function. + The network returns loss values from three YOLO layers during training + and detection results during test. + """ + + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is None: + backbone = DFPPAFPN() + if head is None: + head = TALHead(20) + + self.backbone = backbone + self.head = head + + def forward(self, x, targets=None, buffer=None, mode='off_pipe'): + # fpn output content features of [dark3, dark4, dark5] + assert mode in ['off_pipe', 'on_pipe'] + + if mode == 'off_pipe': + fpn_outs = self.backbone(x, buffer=buffer, mode='off_pipe') + if self.training: + pass + else: + outputs = self.head(fpn_outs, imgs=x) + + return outputs + elif mode == 'on_pipe': + fpn_outs, buffer_ = self.backbone(x, buffer=buffer, mode='on_pipe') + outputs = self.head(fpn_outs) + + return outputs, buffer_ diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py b/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py new file mode 100644 index 00000000..7a82f8c6 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py @@ -0,0 +1,170 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .network_blocks import BaseConv, DWConv + + +class TALHead(nn.Module): + + def __init__( + self, + num_classes, + width=1.0, + strides=[8, 16, 32], + in_channels=[256, 512, 1024], + act='silu', + depthwise=False, + gamma=1.5, + ignore_thr=0.2, + ignore_value=0.2, + ): + """ + Args: + act (str): activation type of conv. Defalut value: "silu". + depthwise (bool): wheather apply depthwise conv in conv branch. Defalut value: False. 
+ """ + super().__init__() + + self.gamma = gamma + self.ignore_thr = ignore_thr + self.ignore_value = ignore_value + + self.n_anchors = 1 + self.num_classes = num_classes + self.decode_in_inference = True # for deploy, set to False + + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + Conv = DWConv if depthwise else BaseConv + + for i in range(len(in_channels)): + self.stems.append( + BaseConv( + in_channels=int(in_channels[i] * width), + out_channels=int(256 * width), + ksize=1, + stride=1, + act=act, + )) + self.cls_convs.append( + nn.Sequential(*[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ])) + self.reg_convs.append( + nn.Sequential(*[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ])) + self.cls_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * self.num_classes, + kernel_size=1, + stride=1, + padding=0, + )) + self.reg_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=4, + kernel_size=1, + stride=1, + padding=0, + )) + self.obj_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * 1, + kernel_size=1, + stride=1, + padding=0, + )) + + self.strides = strides + self.grids = [torch.zeros(1)] * len(in_channels) + self.expanded_strides = [None] * len(in_channels) + + def forward(self, xin, labels=None, imgs=None): + outputs = [] + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin)): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + if self.training: + pass + + else: + output = torch.cat( + [reg_output, + obj_output.sigmoid(), + cls_output.sigmoid()], 1) + + outputs.append(output) + + if self.training: + pass + else: + self.hw = [x.shape[-2:] for x in outputs] + outputs = torch.cat([x.flatten(start_dim=2) for x in outputs], + dim=2).permute(0, 2, 1) + if self.decode_in_inference: + return self.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + + def decode_outputs(self, outputs, dtype): + grids = [] + strides = [] + for (hsize, wsize), stride in zip(self.hw, self.strides): + yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = torch.cat(grids, dim=1).type(dtype) + strides = torch.cat(strides, dim=1).type(dtype) + + outputs[..., :2] = (outputs[..., :2] + grids) * strides + outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides + return outputs diff --git a/modelscope/outputs.py b/modelscope/outputs.py index c08779b4..a49ddacf 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -165,6 +165,32 @@ TASK_OUTPUTS = { Tasks.image_object_detection: [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES], + # video object 
detection result for single sample + # { + + # "scores": [[0.8, 0.25, 0.05, 0.05], [0.9, 0.1, 0.05, 0.05]] + # "labels": [["person", "traffic light", "car", "bus"], + # ["person", "traffic light", "car", "bus"]] + # "boxes": + # [ + # [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ], + # [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # ], + + # } + Tasks.video_object_detection: + [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES], + # instance segmentation result for single sample # { # "scores": [0.9, 0.1, 0.05, 0.05], @@ -676,8 +702,9 @@ TASK_OUTPUTS = { # "text_embedding": np.array with shape [1, D], # "similarity": float # } - Tasks.multi_modal_similarity: - [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES], + Tasks.multi_modal_similarity: [ + OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES + ], # VQA result for a sample # {"text": "this is a text answser. "} diff --git a/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py new file mode 100644 index 00000000..3686c50a --- /dev/null +++ b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from typing import Any, Dict, List, Union + +import cv2 +import json +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.realtime_object_detection import \ + RealtimeVideoDetector +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Input, Model, Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import load_image +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_object_detection, + module_name=Pipelines.realtime_video_object_detection) +class RealtimeVideoObjectDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + super().__init__(model=model, **kwargs) + self.model = RealtimeVideoDetector(model) + + def preprocess(self, input: Input) -> Dict[Tensor, Union[str, np.ndarray]]: + return input + + def forward(self, input: Input) -> Dict[Tensor, Dict[str, np.ndarray]]: + self.video_path = input + # Processing the whole video and return results for each frame + forward_output = self.model.inference_video(self.video_path) + return {'forward_output': forward_output} + + def postprocess(self, input: Dict[Tensor, Dict[str, np.ndarray]], + **kwargs) -> str: + forward_output = input['forward_output'] + + scores, boxes, labels = [], [], [] + for result in forward_output: + box, score, label = result + scores.append(score) + boxes.append(box) + labels.append(label) + + return { + OutputKeys.BOXES: boxes, + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + } diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 9e10e802..0eb369da 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -38,6 +38,7 @@ class CVTasks(object): image_classification_dailylife = 'image-classification-dailylife' image_object_detection = 'image-object-detection' + video_object_detection = 'video-object-detection' 
image_segmentation = 'image-segmentation' semantic_segmentation = 'semantic-segmentation' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 2d420892..34dc2348 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -231,6 +231,66 @@ def show_video_tracking_result(video_in_path, bboxes, video_save_path): cap.release() +def show_video_object_detection_result(video_in_path, bboxes_list, labels_list, + video_save_path): + + PALETTE = { + 'person': [128, 0, 0], + 'bicycle': [128, 128, 0], + 'car': [64, 0, 0], + 'motorcycle': [0, 128, 128], + 'bus': [64, 128, 0], + 'truck': [192, 128, 0], + 'traffic light': [64, 0, 128], + 'stop sign': [192, 0, 128], + } + from tqdm import tqdm + import math + cap = cv2.VideoCapture(video_in_path) + with tqdm(total=len(bboxes_list)) as pbar: + pbar.set_description( + 'Writing results to video: {}'.format(video_save_path)) + for i in range(len(bboxes_list)): + bboxes = bboxes_list[i].astype(int) + labels = labels_list[i] + success, frame = cap.read() + if success is False: + raise Exception(video_in_path, + ' can not be correctly decoded by OpenCV.') + if i == 0: + size = (frame.shape[1], frame.shape[0]) + fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G') + video_writer = cv2.VideoWriter(video_save_path, fourcc, + cap.get(cv2.CAP_PROP_FPS), size, + True) + + FONT_SCALE = 1e-3 # Adjust for larger font size in all images + THICKNESS_SCALE = 1e-3 # Adjust for larger thickness in all images + TEXT_Y_OFFSET_SCALE = 1e-2 # Adjust for larger Y-offset of text and bounding box + H, W, _ = frame.shape + zeros_mask = np.zeros((frame.shape)).astype(np.uint8) + for bbox, l in zip(bboxes, labels): + cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), + PALETTE[l], 1) + cv2.putText( + frame, + l, (bbox[0], bbox[1] - int(TEXT_Y_OFFSET_SCALE * H)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=min(H, W) * FONT_SCALE, + thickness=math.ceil(min(H, W) * THICKNESS_SCALE), + color=PALETTE[l]) + zeros_mask = cv2.rectangle( + zeros_mask, (bbox[0], bbox[1]), (bbox[2], bbox[3]), + color=PALETTE[l], + thickness=-1) + + frame = cv2.addWeighted(frame, 1., zeros_mask, .65, 0) + video_writer.write(frame) + pbar.update(1) + video_writer.release + cap.release() + + def panoptic_seg_masks_to_image(masks): draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3]) from mmdet.core.visualization.palette import get_palette diff --git a/tests/pipelines/test_realtime_video_object_detection.py b/tests/pipelines/test_realtime_video_object_detection.py new file mode 100644 index 00000000..d65313a3 --- /dev/null +++ b/tests/pipelines/test_realtime_video_object_detection.py @@ -0,0 +1,46 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
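The visualization helper above draws label-colored boxes twice: as thin outlines on the frame and as filled rectangles on a separate mask, which is then blended in with cv2.addWeighted to produce translucent fills. A self-contained sketch of that blending step on a synthetic frame (the color and blend weights mirror the helper; the geometry is arbitrary):

import cv2
import numpy as np

frame = np.full((240, 320, 3), 255, dtype=np.uint8)     # blank white frame
mask = np.zeros_like(frame)
cv2.rectangle(frame, (40, 40), (160, 120), color=(128, 0, 0), thickness=1)    # thin outline on the frame
cv2.rectangle(mask, (40, 40), (160, 120), color=(128, 0, 0), thickness=-1)    # filled box on the mask
blended = cv2.addWeighted(frame, 1.0, mask, 0.65, 0)    # translucent fill over the frame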
+import unittest + +import cv2 +import numpy as np + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import show_video_object_detection_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class RealtimeVideoObjectDetectionTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.model_id = 'damo/cv_cspnet_video-object-detection_streamyolo' + self.test_video = 'data/test/videos/test_realtime_vod.mp4' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + realtime_video_object_detection = pipeline( + Tasks.video_object_detection, model=self.model_id) + result = realtime_video_object_detection(self.test_video) + if result: + logger.info('Video output to test_vod_results.avi') + show_video_object_detection_result(self.test_video, + result[OutputKeys.BOXES], + result[OutputKeys.LABELS], + 'test_vod_results.avi') + else: + raise ValueError('process error') + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From e3eb01f4cee8c39a66c797cfdf8e29917011424a Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 17 Oct 2022 23:31:44 +0800 Subject: [PATCH 53/57] [to #42322933]update word-segmentation regression results Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10432186 --- data/test/regression/sbert_ws_en.bin | 4 ++-- data/test/regression/sbert_ws_zh.bin | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/test/regression/sbert_ws_en.bin b/data/test/regression/sbert_ws_en.bin index 4eb562d6..6e441f7f 100644 --- a/data/test/regression/sbert_ws_en.bin +++ b/data/test/regression/sbert_ws_en.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572 -size 60801 +oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c +size 61239 diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin index 555f640d..b1841351 100644 --- a/data/test/regression/sbert_ws_zh.bin +++ b/data/test/regression/sbert_ws_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c -size 60801 +oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1 +size 61115 From 2eb835aca489f5b7dcdfb6199d34f1bfc85f6d7c Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Tue, 18 Oct 2022 11:12:12 +0800 Subject: [PATCH 54/57] [to #42322933]Add uuid to model which created by ut test Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10434107 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10434107 * [Update] update finetune_result_upload * [Update] rename finetune_result_upload to model_dir_upload * Merge branch 'master' into feat/upload_ckpt * Merge branch 'master' into feat/upload_ckpt * [Fix] fix import error * [Fix] fix import error * Merge branch 'master' into feat/upload_ckpt * [Update] changes name to upload_folder and using tempfile to save repo * Merge branch 'master' into feat/upload_ckpt * [Fix] 
fix commit * Merge branch 'master' into feat/upload_ckpt * [Fix] fix format * Merge branch 'master' into feat/upload_ckpt * [Fix] add uuid after model created from upload ut --- tests/hub/test_hub_upload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/hub/test_hub_upload.py b/tests/hub/test_hub_upload.py index d7e6e439..2250164b 100644 --- a/tests/hub/test_hub_upload.py +++ b/tests/hub/test_hub_upload.py @@ -3,6 +3,7 @@ import os import shutil import tempfile import unittest +import uuid from modelscope.hub.api import HubApi from modelscope.hub.constants import Licenses, ModelVisibility @@ -23,7 +24,9 @@ class HubUploadTest(unittest.TestCase): self.api = HubApi() self.user = os.environ.get('TEST_MODEL_ORG', 'citest') logger.info(self.user) - self.create_model_name = '%s/%s' % (self.user, 'test_model_upload') + self.create_model_name = '%s/%s_%s' % (self.user, 'test_model_upload', + uuid.uuid4().hex) + logger.info('create %s' % self.create_model_name) temporary_dir = tempfile.mkdtemp() self.work_dir = temporary_dir self.model_dir = os.path.join(temporary_dir, self.create_model_name) From c0b546a96eaaaaef2e9ab1bf32b1abe9092d33e1 Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Tue, 18 Oct 2022 14:34:26 +0800 Subject: [PATCH 55/57] [to #42322933]add subset_name when loading dataset (NAFNet image denoising) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10427797 --- tests/trainers/test_image_denoise_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index 0bcb8930..68ddf616 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -33,11 +33,13 @@ class ImageDenoiseTrainerTest(unittest.TestCase): dataset_train = MsDataset.load( 'SIDD', namespace='huizheng', + subset_name='default', split='validation', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds dataset_val = MsDataset.load( 'SIDD', namespace='huizheng', + subset_name='default', split='test', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds self.dataset_train = SiddImageDenoisingDataset( From 3b1f1a0252d4fee7ecd15ac8dc7c04ec0535add0 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Tue, 18 Oct 2022 15:58:33 +0800 Subject: [PATCH 56/57] [to #42322933] Add GPT3 tensor parallel inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加基于 Megatron-v3 的 GPT3 tensor 并行的推理代码 复用 DistributedPipeline 与 megatron-util 适用模型:1.3B/2.7B/13B 参数的 GPT-3 预训练生成模型 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10416721 --- modelscope/metainfo.py | 2 + modelscope/models/nlp/gpt3/__init__.py | 2 + .../models/nlp/gpt3/configuration_gpt3.py | 88 +- .../models/nlp/gpt3/distributed_gpt3.py | 1057 +++++++++++++++++ modelscope/models/nlp/gpt3/modeling_gpt3.py | 54 +- modelscope/models/nlp/gpt3/tokenizer_gpt3.py | 69 ++ .../nlp/distributed_gpt3_pipeline.py | 54 + modelscope/preprocessors/__init__.py | 2 + modelscope/preprocessors/nlp/__init__.py | 2 + modelscope/preprocessors/nlp/nlp_base.py | 35 + modelscope/utils/nlp/distributed.py | 5 +- tests/pipelines/test_gpt3_text_generation.py | 58 + 12 files changed, 1387 insertions(+), 41 deletions(-) create mode 100644 modelscope/models/nlp/gpt3/distributed_gpt3.py create mode 100644 modelscope/models/nlp/gpt3/tokenizer_gpt3.py create mode 100644 modelscope/pipelines/nlp/distributed_gpt3_pipeline.py create mode 100644 
tests/pipelines/test_gpt3_text_generation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index e4a26303..2dbff948 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -227,6 +227,7 @@ class Pipelines(object): zero_shot_classification = 'zero-shot-classification' text_error_correction = 'text-error-correction' plug_generation = 'plug-generation' + gpt3_generation = 'gpt3-generation' faq_question_answering = 'faq-question-answering' conversational_text_to_sql = 'conversational-text-to-sql' table_question_answering_pipeline = 'table-question-answering-pipeline' @@ -324,6 +325,7 @@ class Preprocessors(object): bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' text_gen_tokenizer = 'text-gen-tokenizer' text2text_gen_preprocessor = 'text2text-gen-preprocessor' + text_gen_jieba_tokenizer = 'text-gen-jieba-tokenizer' text2text_translate_preprocessor = 'text2text-translate-preprocessor' token_cls_tokenizer = 'token-cls-tokenizer' ner_tokenizer = 'ner-tokenizer' diff --git a/modelscope/models/nlp/gpt3/__init__.py b/modelscope/models/nlp/gpt3/__init__.py index 076a0c6b..9cae8cc8 100644 --- a/modelscope/models/nlp/gpt3/__init__.py +++ b/modelscope/models/nlp/gpt3/__init__.py @@ -7,11 +7,13 @@ if TYPE_CHECKING: from .configuration_gpt3 import GPT3Config from .modeling_gpt3 import GPT3Model from .gpt3_for_text_generation import GPT3ForTextGeneration + from .tokenizer_gpt3 import JiebaBPETokenizer else: _import_structure = { 'configuration_gpt3': ['GPT3Config'], 'modeling_gpt3': ['GPT3Model'], 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], + 'tokenizer_gpt3': ['JiebaBPETokenizer'], } import sys diff --git a/modelscope/models/nlp/gpt3/configuration_gpt3.py b/modelscope/models/nlp/gpt3/configuration_gpt3.py index d5a054fd..66e8b836 100644 --- a/modelscope/models/nlp/gpt3/configuration_gpt3.py +++ b/modelscope/models/nlp/gpt3/configuration_gpt3.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
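# --- Illustrative sketch (hypothetical values, not part of the committed diff) ---
# The GPT3Config changes below add Megatron-style parallelism and generation
# fields. A minimal usage sketch, assuming the defaults defined below:
#
#     from modelscope.models.nlp.gpt3 import GPT3Config
#
#     cfg = GPT3Config(hidden_size=2048, num_attention_heads=32, fp16=True)
#     cfg.ffn_hidden_size   # 8192  -- defaults to 4 * hidden_size
#     cfg.kv_channels       # 64    -- hidden_size // num_attention_heads
#     cfg.params_dtype      # torch.float16, because fp16=True and bf16 is unset
# ----------------------------------------------------------------------------------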
+import torch from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -21,25 +22,48 @@ logger = logging.get_logger(__name__) class GPT3Config(PretrainedConfig): - model_type = 'gpt' + model_type = 'gpt3' - def __init__(self, - vocab_size=25600, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act='gelu', - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=2048, - type_vocab_size=2, - layernorm_epsilon=1e-12, - **kwargs): + def __init__( + self, + vocab_size=25600, + hidden_size=768, + ffn_hidden_size=None, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=2048, + type_vocab_size=2, + layernorm_epsilon=1e-12, + bias_gelu_fusion=True, + fp32_residual_connection=False, + sequence_parallel=False, + fp16=False, + bf16=False, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=False, + kv_channels=None, + masked_softmax_fusion=True, + attention_dropout=0.1, + bias_dropout_fusion=True, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.1, + init_method_std=0.02, + # generate + eod_id=7, + tokens_to_generate=100, + top_k=0, + top_p=0.9, + **kwargs): super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size + self.ffn_hidden_size = 4 * hidden_size \ + if ffn_hidden_size is None else ffn_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act @@ -49,3 +73,39 @@ class GPT3Config(PretrainedConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.layernorm_epsilon = layernorm_epsilon + self.bias_gelu_fusion = bias_gelu_fusion + self.fp32_residual_connection = fp32_residual_connection + self.sequence_parallel = sequence_parallel + self.fp16 = fp16 + self.bf16 = bf16 + assert not (fp16 and bf16) + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + if kv_channels is None: + assert hidden_size % num_attention_heads == 0 + self.kv_channels = hidden_size // num_attention_heads + self.masked_softmax_fusion = masked_softmax_fusion + self.attention_dropout = attention_dropout + self.bias_dropout_fusion = bias_dropout_fusion + self.apply_residual_connection_post_layernorm = \ + apply_residual_connection_post_layernorm + self.hidden_dropout = hidden_dropout + self.init_method_std = init_method_std + self.eod_id = eod_id + self.tokens_to_generate = tokens_to_generate + self.top_k = top_k + self.top_p = top_p + + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + self.no_persist_layer_norm = \ + TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11) + + @property + def params_dtype(self): + if self.fp16: + return torch.half + elif self.bf16: + return torch.bfloat16 + else: + return torch.float diff --git a/modelscope/models/nlp/gpt3/distributed_gpt3.py b/modelscope/models/nlp/gpt3/distributed_gpt3.py new file mode 100644 index 00000000..a0091259 --- /dev/null +++ b/modelscope/models/nlp/gpt3/distributed_gpt3.py @@ -0,0 +1,1057 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +from megatron import mpu +from megatron.global_vars import get_global_memory_buffer, set_global_variables +from megatron.model import (AttnMaskType, Float16Module, LayerNorm, + bias_gelu_impl) +from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from torch import nn +from torch.nn import functional as F +from transformers.modeling_utils import PreTrainedModel + +from modelscope.models import TorchModel +from modelscope.models.nlp.gpt3 import GPT3Config +from modelscope.utils.nlp.distributed import initialize_distributed +from modelscope.utils.nlp.load_checkpoint import pre_load +from modelscope.utils.torch_utils import set_random_seed_mpu + + +class GPT3ParallelMLP(nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config, init_method, output_layer_init_method): + super().__init__() + + # Project to 4h. + self.dense_h_to_4h = mpu.ColumnParallelLinearV3( + config, + config.hidden_size, + config.ffn_hidden_size, + gather_output=False, + init_method=init_method, + skip_bias_add=True) + + self.bias_gelu_fusion = config.bias_gelu_fusion + self.activation_func = F.gelu + + # Project back to h. + self.dense_4h_to_h = mpu.RowParallelLinearV3( + config, + config.ffn_hidden_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True) + + def forward(self, hidden_states): + + # [s, b, 4hp] + intermediate_parallel, bias_parallel = self.dense_h_to_4h( + hidden_states) + + if self.bias_gelu_fusion: + intermediate_parallel = \ + bias_gelu_impl(intermediate_parallel, bias_parallel) + else: + intermediate_parallel = \ + self.activation_func(intermediate_parallel + bias_parallel) + + # [s, b, h] + output, output_bias = self.dense_4h_to_h(intermediate_parallel) + return output, output_bias + + +class GPT3Embedding(nn.Module): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, config, init_method): + super().__init__() + + self.hidden_size = config.hidden_size + self.init_method = init_method + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + config.vocab_size, self.hidden_size, init_method=self.init_method) + + # Position embedding (serial). + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + self.hidden_size) + # Initialize the position embeddings. 
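        # Note the split above: the vocabulary table is sharded across
        # tensor-parallel ranks (VocabParallelEmbedding), while the position
        # table is a plain nn.Embedding replicated on every rank; the call
        # below applies the shared init method to that replicated table.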
+ self.init_method(self.position_embeddings.weight) + + self.fp32_residual_connection = config.fp32_residual_connection + self.sequence_parallel = config.sequence_parallel + # Embeddings dropout + self.embedding_dropout = nn.Dropout(config.hidden_dropout) + + def zero_parameters(self): + """Zero out all parameters in embedding.""" + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True + + def forward(self, input_ids, position_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + + # If the input flag for fp32 residual connection is set, convert for float. + if self.fp32_residual_connection: + embeddings = embeddings.float() + + # Dropout. + if self.sequence_parallel: + embeddings = mpu.scatter_to_sequence_parallel_region(embeddings) + with mpu.get_cuda_rng_tracker().fork(): + embeddings = self.embedding_dropout(embeddings) + else: + embeddings = self.embedding_dropout(embeddings) + return embeddings + + +class NoopTransformerLayer(nn.Module): + + def __init__(self, layer_number): + super().__init__() + self.layer_number = layer_number + + def forward(self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None): + return hidden_states.clone() + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +class GPT3CoreAttention(nn.Module): + + def __init__(self, + config, + layer_number, + attn_mask_type=AttnMaskType.padding): + super().__init__() + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + self.sequence_parallel = config.sequence_parallel + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_partition = mpu.divide(projection_size, + world_size) + self.hidden_size_per_attention_head = mpu.divide( + projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = mpu.divide( + config.num_attention_heads, world_size) + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + self.fp16, self.bf16, self.attn_mask_type, + config.masked_softmax_fusion, attention_mask_func, + self.attention_softmax_in_fp32, coeff) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), + query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), + query_layer.dtype, 'mpu') + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor)) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.sequence_parallel: + with mpu.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), + query_layer.size(0), value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class GPT3ParallelAttention(nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config, init_method, output_layer_init_method, + layer_number): + super().__init__() + self.layer_number = max(1, layer_number) + self.params_dtype = config.params_dtype + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_attention_head = mpu.divide( + projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = mpu.divide( + config.num_attention_heads, world_size) + + # Strided linear layer. 
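        # Worked example of the partitioning above (hypothetical sizes): with
        # hidden_size=2048, num_attention_heads=32 and a tensor-parallel world
        # size of 8, projection_size is 2048, each head is 64-dim and each rank
        # owns 4 heads, so the fused QKV layer built below produces
        # 3 * 4 * 64 = 768 output columns per rank (6144 across all ranks).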
+ self.query_key_value = mpu.ColumnParallelLinearV3( + config, + config.hidden_size, + 3 * projection_size, + gather_output=False, + init_method=init_method) + + self.core_attention = GPT3CoreAttention(config, self.layer_number) + + # Output. + self.dense = mpu.RowParallelLinearV3( + config, + projection_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True) + + def _allocate_memory(self, inference_max_sequence_len, batch_size): + return torch.empty( + inference_max_sequence_len, + batch_size, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + dtype=self.params_dtype, + device=torch.cuda.current_device()) + + def forward(self, hidden_states, attention_mask, inference_params=None): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + if inference_params: + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_len = inference_params.max_sequence_len + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_value_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, inference_value_memory) + else: + inference_key_memory, inference_value_memory = \ + inference_params.key_value_memory_dict[self.layer_number] + + # ===================== + # Query, Key, and Value + # ===================== + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, + value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3) + + # ================================== + # Adjust key and value for inference + # ================================== + + if inference_params: + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key_layer.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key_layer.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = key_layer + inference_value_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = value_layer + key_layer = inference_key_memory[:sequence_end, + batch_start:batch_end, ...] + value_layer = inference_value_memory[:sequence_end, + batch_start:batch_end, ...] + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, + value_layer, attention_mask) + + # ================= + # Output. 
[sq, b, h] + # ================= + + output, bias = self.dense(context_layer) + + return output, bias + + +class nullcontext: + + def __init__(self, enter_result=None): + self.enter_result = enter_result + + def __enter__(self): + return self.enter_result + + def __exit__(self, *excinfo): + pass + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor + out = torch.nn.functional.dropout(x + bias, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + + return _bias_dropout_add + + +@torch.jit.script +def bias_dropout_add_fused_train(x: torch.Tensor, bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference(x: torch.Tensor, bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, False) + + +class GPT3ParallelTransformerLayer(nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config, init_method, output_layer_init_method, + layer_number): + + super().__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm \ + = config.apply_residual_connection_post_layernorm + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + + # Layernorm on the input data. + self.input_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel) + + # Self attention. + self.self_attention = GPT3ParallelAttention(config, init_method, + output_layer_init_method, + layer_number) + self.hidden_dropout = config.hidden_dropout + self.bias_dropout_fusion = config.bias_dropout_fusion + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel) + + # MLP + self.mlp = GPT3ParallelMLP(config, init_method, + output_layer_init_method) + + # Set bias+dropout+add fusion grad_enable execution handler. + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 + and TORCH_MINOR >= 10) + self.bias_dropout_add_exec_handler = \ + nullcontext if use_nvfuser else torch.enable_grad + + def forward(self, hidden_states, attention_mask, inference_params=None): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, attention_bias = \ + self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params) + # Residual connection. 
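        # In the default pre-LN configuration this layer computes
        #   y   = x + dropout(Attention(LN1(x)) + bias)
        #   out = y + dropout(MLP(LN2(y)) + bias)
        # With apply_residual_connection_post_layernorm=True the residual is
        # taken from the layer-norm output instead, which is what the branch
        # below selects.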
+ if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + if self.bias_dropout_fusion: + if self.training: + bias_dropout_add_func = bias_dropout_add_fused_train + else: + bias_dropout_add_func = bias_dropout_add_fused_inference + else: + bias_dropout_add_func = get_bias_dropout_add(self.training) + + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, attention_bias.expand_as(residual), residual, + self.hidden_dropout) + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output, mlp_bias = self.mlp(layernorm_output) + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func(mlp_output, + mlp_bias.expand_as(residual), + residual, self.hidden_dropout) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = mpu.make_viewless_tensor( + inp=output, requires_grad=output.requires_grad, keep_graph=True) + + return output + + +class GPT3ParallelTransformer(nn.Module): + """Transformer class.""" + + def __init__(self, + config, + init_method, + output_layer_init_method, + post_layer_norm=True, + pre_process=True, + post_process=True): + super().__init__() + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + self.input_tensor = None + + self.sequence_parallel = config.sequence_parallel + + # Number of layers. + self.num_layers = config.num_hidden_layers + + # Transformer layers. + def build_layer(layer_number): + return GPT3ParallelTransformerLayer(config, init_method, + output_layer_init_method, + layer_number) + + if self.num_layers == 0: + self.num_layers = 1 + self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) + else: + self.layers = torch.nn.ModuleList( + [build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + self.final_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward(self, hidden_states, attention_mask, inference_params=None): + # hidden_states: [s, b, h] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. 
+ # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = mpu.make_viewless_tensor( + hidden_states, + requires_grad=True, + keep_graph=True, + ) + + if self.sequence_parallel: + rng_context = mpu.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + with rng_context: + # Forward pass. + for index in range(self.num_layers): + layer = self._get_layer(index) + hidden_states = layer( + hidden_states, + attention_mask, + inference_params=inference_params) + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + +class GPT3TransformerLanguageModel(nn.Module): + """Transformer language model. + + Arguments: + transformer_hparams: transformer hyperparameters + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, config, init_method, output_layer_init_method): + super().__init__() + + self.hidden_size = config.hidden_size + self.init_method = init_method + self.encoder_hidden_state = None + + # Embeddings. + self.embedding = GPT3Embedding(config, self.init_method) + + # Transformer. + self.encoder = GPT3ParallelTransformer( + config, + self.init_method, + output_layer_init_method, + ) + + def forward(self, + enc_input_ids, + enc_position_ids, + enc_attn_mask, + inference_params=None, + enc_hidden_states=None): + + # Encoder embedding. + encoder_input = self.embedding(enc_input_ids, enc_position_ids) + + # Run encoder. 
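        # Everything past the embedding runs in Megatron's sequence-first
        # layout [s, b, h] (the embedding layer transposed from [b, s, h]
        # above). The block below either runs the transformer stack or, when
        # precomputed hidden states are supplied, substitutes them directly.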
+ if enc_hidden_states is None: + if self.encoder is not None: + encoder_output = self.encoder( + encoder_input, + enc_attn_mask, + inference_params=inference_params) + else: + encoder_output = self.encoder_hidden_state + else: + encoder_output = enc_hidden_states.to(encoder_input.dtype) + + return encoder_output + + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +class GPT3Model(PreTrainedModel): + + config_class = GPT3Config + + def __init__(self, config, parallel_output=False): + super().__init__(config) + + self.parallel_output = parallel_output + + self.language_model = GPT3TransformerLanguageModel( + config, init_method_normal(config.init_method_std), + scaled_init_method_normal(config.init_method_std, + config.num_hidden_layers)) + + def word_embeddings_weight(self): + return self.language_model.embedding.word_embeddings.weight + + @staticmethod + def build_attention_mask_and_position_ids(tokens): + seq_length = tokens.size(1) + attention_mask = torch.tril( + torch.ones((1, 1, seq_length, seq_length), + dtype=torch.long, + device=tokens.device)) + attention_mask = (attention_mask < 0.5) + + position_ids = torch.arange( + seq_length, dtype=torch.long, device=tokens.device) + position_ids = position_ids.unsqueeze(0).expand_as(tokens) + + return attention_mask, position_ids + + def forward(self, + input_ids, + attention_mask=None, + position_ids=None, + inference_params=None, + **kwargs): + if attention_mask is None and position_ids is None: + attention_mask, position_ids = \ + self.build_attention_mask_and_position_ids(input_ids) + + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + inference_params=inference_params) + + logits_parallel = mpu.LinearWithGradAccumulationAndAsyncCommunication.apply( + lm_output, self.word_embeddings_weight(), None, False, True, + self.config.sequence_parallel) + # Gather if needed. + + output = logits_parallel + if not self.parallel_output: + output = mpu.gather_from_model_parallel_region(logits_parallel) + return output.transpose(0, 1).contiguous() + + +def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf.""" + + filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(filter_, float('-Inf')) + + +def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf.""" + + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Filteration based on the cumulative sum. + filter_ = cumulative_probs > top_p + # This shift by 1 is weird and I cannot justify it. This existed + # in the original implementation: + # https://github.com/ari-holtzman/degen/blob/master/gen.py + # and I guess it is needed so keeping it for now. + filter_[:, 1:] = filter_[:, :-1].clone() + # Make sure we at least have one token to select from. 
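    # Clarification of the shift above: after it, a position is masked only
    # when the cumulative probability of the tokens *before* it already
    # exceeds top_p, so the first token that crosses the threshold is kept
    # (the usual nucleus-sampling convention). The line below additionally
    # guarantees that the single most probable token always survives filtering.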
+ filter_[..., 0] = 0 + + # Fill in the filtered part + filter_ = filter_.scatter(1, sorted_indices, filter_) + logits.masked_fill_(filter_, float('-Inf')) + + +def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None): + """ Sample and generate a token. + Note: logits has the dimension [b, v] where b is the batch size + and v is the vocabulary size. + If vocab_size is provided, we will make sure the sample that is + generated is in [0, vocab-size). This will avoid out of vocabulary + generations due to padding. + """ + + # Check logits for consistency. + assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.' + assert logits.type() == 'torch.cuda.FloatTensor', \ + 'input logits should be floats.' + + # Greedy is just simple argmax. + if top_k == 1: + assert top_p == 0.0, 'cannot set both greedy and top-p samplings.' + samples = torch.argmax(logits, dim=-1) + + # Top-k or top-p sampling. + else: + # Clone so we do not modify the inputs, + logits = logits.clone() + # Apply temperature in place. + if temperature != 1.0: + logits.div_(temperature) + + if top_k > 1: + assert top_p == 0.0, 'cannot set both top-k and top-p samplings.' + assert top_k <= logits.size(1), 'top-k is larger than logit size.' + if vocab_size: + assert top_k < vocab_size, 'top-k is larger than vocab size.' + modify_logits_for_top_k_filtering(logits, top_k) + + elif top_p > 0.0: + assert top_p <= 1.0, 'top-p should be in (0, 1].' + modify_logits_for_top_p_filtering(logits, top_p) + + # After filtering, we need to recalculate the distribution. + probs = logits.softmax(dim=-1) + samples = torch.multinomial(probs, num_samples=1).view(-1) + + # If vocab size is provided, make sure the samples are in + # in the range [0, vocab-size). + if vocab_size: + samples = torch.clamp(samples, min=0, max=(vocab_size - 1)) + + return samples + + +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + def __init__(self, max_batch_size, max_sequence_len): + """Note that offsets are set to zero and we always set the + flag to allocate memory. After the first call, make sure to + set this flag to False.""" + self.max_sequence_len = max_sequence_len + self.max_batch_size = max_batch_size + self.sequence_len_offset = 0 + self.batch_size_offset = 0 + self.key_value_memory_dict = {} + + def swap_key_value_dict(self, batch_idx): + 'swap between batches' + if len(self.key_value_memory_dict) == 0: + raise ValueError('should not swap when dict in empty') + + for layer_number in self.key_value_memory_dict.keys(): + inference_key_memory, inference_value_memory = self.key_value_memory_dict[ + layer_number] + assert len(batch_idx) == inference_key_memory.shape[ + 1] # make sure batch size is the same + new_inference_key_memory = inference_key_memory[:, batch_idx] + new_inference_value_memory = inference_value_memory[:, batch_idx] + self.key_value_memory_dict[layer_number] = ( + new_inference_key_memory, new_inference_value_memory) + + +class DistributedGPT3(TorchModel): + + def __init__(self, + model_dir, + rank, + path_load_tag='model', + *args, + **kwargs): + super().__init__(model_dir, *args, **kwargs) + initialize_distributed(rank, mpu, kwargs['world_size'], + kwargs['model_parallel_size'], + kwargs['master_ip'], kwargs['master_port']) + seed = 0 if 'seed' not in kwargs else kwargs['seed'] + set_random_seed_mpu(seed) + set_global_variables() + + self.config = GPT3Config.from_pretrained(model_dir) + # Build model. 
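        # Each rank in the tensor-parallel group executes the construction
        # below identically: build the parallel GPT3Model, move it to the
        # rank's GPU, optionally wrap it in Float16Module, then load this
        # rank's own checkpoint shard via pre_load (e.g. the
        # model/mp_rank_0X_model_states.pt files listed in the 13B test
        # added later in this patch).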
+ model = GPT3Model(self.config) + + for param in model.parameters(): + mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param) + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if self.config.fp16 or self.config.bf16: + model = Float16Module(model, self.config) + + self.dist_model = model + load_model = pre_load(mpu, model_dir, tag=path_load_tag) + self.dist_model.load_state_dict(load_model) + + self.inference_params = None + + def forward_step(self, tokens, attention_mask, position_ids): + logits = self.dist_model( + tokens, + attention_mask, + position_ids, + inference_params=self.inference_params) + self.inference_params.sequence_len_offset += tokens.size(1) + return logits + + def generate(self, + tokens, + temperature=1.0, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False): + lengths = torch.tensor([tokens.size(1)], device=tokens.device) + pads = torch.ones( + 1, self.config.tokens_to_generate, + device=tokens.device).long() * self.config.eod_id + tokens = torch.cat((tokens, pads), dim=-1) + + batch_size = tokens.size(0) + min_prompt_length = lengths.min().item() + max_sequence_length = tokens.size(1) + max_sequence_length = min(max_sequence_length, + self.config.max_position_embeddings) + + # If the context is too big, this happens + if min_prompt_length >= max_sequence_length: + raise ValueError('context length + tokens_to_generate too large') + + # Initialize inference parameters. + self.inference_params = InferenceParams(batch_size, + max_sequence_length) + + # Added termination_id to support the case that we want to terminate the + # generation once that id is generated. + termination_id = self.config.eod_id + + # Whether we have reached a termination id. + is_generation_done = torch.zeros( + batch_size, dtype=torch.uint8, device=torch.cuda.current_device()) + + # ============= + # Run infernece + # ============= + + with torch.no_grad(): + attention_mask, position_ids = \ + GPT3Model.build_attention_mask_and_position_ids(tokens) + prev_context_length = 0 + for context_length in range(min_prompt_length, + max_sequence_length): + + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, prev_context_length:context_length] + positions2use = position_ids[:, prev_context_length: + context_length] + attention_mask2use = attention_mask[ + ..., prev_context_length:context_length, :context_length] + + # logits will be meanigful only in the last pipeline stage. + logits = self.forward_step(tokens2use, attention_mask2use, + positions2use) + + # Sample. + last_token_logits = logits[:, -1, :] + new_sample = sample( + last_token_logits, + top_k=self.config.top_k, + top_p=self.config.top_p, + temperature=temperature, + vocab_size=self.config.vocab_size) + + # If a prompt length is smaller or equal th current context + # length, it means we have started generating tokens + started = lengths <= context_length + # Update the tokens. + tokens[started, context_length] = new_sample[started] + + # Update the context length for the next token generation. 
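                # Incremental decoding: the first pass feeds the whole prompt,
                # after which each step feeds only the token generated since
                # the previous pass, because the per-layer key/value cache in
                # InferenceParams already holds all earlier positions. The
                # assignment below advances the window start accordingly.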
+ prev_context_length = context_length + + # instead tokenization should be in the inference loop so stop sequences can be used + if stop_on_double_eol: + hit_double_eol = (new_sample + == 628).byte() & started.byte() + hit_two_eols = (new_sample == 198).byte() & ( + tokens[:, context_length - 1] + == 198).byte() & started.byte() + done_token = hit_double_eol | hit_two_eols + elif stop_on_eol: + hit_double_eol = (new_sample + == 628).byte() & started.byte() + hit_eol = (new_sample == 198).byte() & started.byte() + done_token = hit_double_eol | hit_eol + else: + done_token = (new_sample == termination_id).byte() & \ + started.byte() + + is_generation_done = is_generation_done | done_token + done = torch.all(is_generation_done) + + if use_eod_token_for_early_termination and done: + break + + tokens = tokens[:, :(context_length + 1)] + return tokens diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index ade36e36..2c23f5db 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -19,8 +19,7 @@ from typing import Optional, Union import addict import torch -from torch.nn import (CrossEntropyLoss, Dropout, Embedding, LayerNorm, Linear, - Module, Softmax) +from torch import nn from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel @@ -28,7 +27,7 @@ from modelscope.utils.constant import ModelFile from .configuration_gpt3 import GPT3Config -class GPT3SelfAttention(Module): +class GPT3SelfAttention(nn.Module): """Parallel self-attention layer abstract class. Self-attention layer takes input with size [s, b, h] @@ -44,13 +43,15 @@ class GPT3SelfAttention(Module): self.hidden_size_per_attention_head = \ self.hidden_size // self.num_attention_heads - self.query_key_value = Linear(self.hidden_size, 3 * self.hidden_size) - self.softmax = Softmax(dim=-1) - self.attention_dropout = Dropout(config.attention_probs_dropout_prob) + self.query_key_value = nn.Linear(self.hidden_size, + 3 * self.hidden_size) + self.softmax = nn.Softmax(dim=-1) + self.attention_dropout = nn.Dropout( + config.attention_probs_dropout_prob) # Output. - self.dense = Linear(self.hidden_size, self.hidden_size) - self.output_dropout = torch.nn.Dropout(config.hidden_dropout_prob) + self.dense = nn.Linear(self.hidden_size, self.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) def _transpose_for_scores(self, tensor): """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with @@ -133,7 +134,7 @@ class GPT3SelfAttention(Module): return output -class GPT3MLP(Module): +class GPT3MLP(nn.Module): """MLP. MLP will take the input with h hidden state, project it to 4*h @@ -146,12 +147,12 @@ class GPT3MLP(Module): hidden_size = config.hidden_size # Project to 4h. - self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size) + self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size) self.activation_func = F.gelu # Project back to h. - self.dense_4h_to_h = Linear(4 * hidden_size, hidden_size) + self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size) - self.dropout = Dropout(config.hidden_dropout_prob) + self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states): @@ -164,7 +165,7 @@ class GPT3MLP(Module): return output -class GPT3TransformerLayer(Module): +class GPT3TransformerLayer(nn.Module): """A single transformer layer. 
Transformer layer takes input with size [s, b, h] and returns an @@ -175,14 +176,14 @@ class GPT3TransformerLayer(Module): super().__init__() # Layernorm on the input data. - self.input_layernorm = LayerNorm( + self.input_layernorm = nn.LayerNorm( config.hidden_size, eps=config.layernorm_epsilon) # Self attention. self.attention = GPT3SelfAttention(config) # Layernorm on the attention output - self.post_attention_layernorm = LayerNorm( + self.post_attention_layernorm = nn.LayerNorm( config.hidden_size, eps=config.layernorm_epsilon) # MLP @@ -208,7 +209,7 @@ class GPT3TransformerLayer(Module): return output -class GPT3Transformer(Module): +class GPT3Transformer(nn.Module): """Transformer class.""" def __init__(self, config): @@ -223,7 +224,7 @@ class GPT3Transformer(Module): [GPT3TransformerLayer(config) for _ in range(self.num_layers)]) # Final layer norm before output. - self.final_layernorm = LayerNorm( + self.final_layernorm = nn.LayerNorm( config.hidden_size, eps=config.layernorm_epsilon) def _get_layer(self, layer_number): @@ -242,7 +243,7 @@ class GPT3Transformer(Module): return hidden_states -class GPT3TransformerLanguageModel(Module): +class GPT3TransformerLanguageModel(nn.Module): """Transformer language model. Arguments: @@ -259,10 +260,11 @@ class GPT3TransformerLanguageModel(Module): super().__init__() # Embeddings. - self.word_embeddings = Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = Embedding(config.max_position_embeddings, - config.hidden_size) - self.embedding_dropout = Dropout(config.hidden_dropout_prob) + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.embedding_dropout = nn.Dropout(config.hidden_dropout_prob) # Transformer. self.transformer = GPT3Transformer(config) @@ -286,19 +288,19 @@ class GPT3Model(PreTrainedModel): def _init_weights(self, module): """Initialize the weights""" - if isinstance(module, Linear): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_( mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, Embedding): + elif isinstance(module, nn.Embedding): module.weight.data.normal_( mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - elif isinstance(module, LayerNorm): + elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @@ -325,7 +327,7 @@ class GPT3Model(PreTrainedModel): logits = self.language_model(input_ids, attention_mask, position_ids) loss = None if labels is not None: - loss_fct = CrossEntropyLoss() + loss_fct = nn.CrossEntropyLoss() loss = loss_fct( logits.view(-1, self.config.vocab_size), labels.view(-1)) return addict.Dict(loss=loss, logits=logits) diff --git a/modelscope/models/nlp/gpt3/tokenizer_gpt3.py b/modelscope/models/nlp/gpt3/tokenizer_gpt3.py new file mode 100644 index 00000000..5780ddbd --- /dev/null +++ b/modelscope/models/nlp/gpt3/tokenizer_gpt3.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tokenizers import Tokenizer + + +class JiebaBPETokenizer: + """SentencePiece BPE tokenizer with Jieba integration""" + + def __init__(self, tokenizer_json_file): + self.name = 'Jieba BPE Tokenizer' + + self.tokenizer = Tokenizer.from_file(tokenizer_json_file) + self.eod_id = self.tokenizer.token_to_id('<|endoftext|>') + try: + import jieba + except ImportError: + raise ImportError( + 'You need to install rjieba to use JiebaTokenizer. ' + 'See https://pypi.org/project/rjieba/ for installation.') + self.jieba = jieba + self.new_line = self.vocab['\n'] + self.sep_token = self.vocab[''] + + @property + def vocab_size(self): + return self.tokenizer.get_vocab_size(with_added_tokens=True) + + @property + def vocab(self): + return self.tokenizer.get_vocab(with_added_tokens=True) + + @property + def inv_vocab(self): + vocab = self.vocab + inv_vocab = dict() + for key, val in vocab.items(): + inv_vocab[val] = key + return inv_vocab + + def tokenize(self, text, is_code=False): + """ + """ + if not is_code: + seg_list = [x for x in self.jieba.cut(text)] + return self.tokenizer.encode( + seg_list, is_pretokenized=True, add_special_tokens=True).ids + else: + return self.tokenizer.encode( + text, is_pretokenized=False, add_special_tokens=True).ids + + def detokenize(self, token_ids): + text = self.tokenizer.decode(token_ids, skip_special_tokens=False) + return text + + @property + def eod(self): + return self.eod_id diff --git a/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py new file mode 100644 index 00000000..325d3303 --- /dev/null +++ b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.nlp.gpt3.distributed_gpt3 import DistributedGPT3 +from modelscope.pipelines.base import DistributedPipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import TextGenerationJiebaPreprocessor +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.text_generation, module_name=Pipelines.gpt3_generation) +class DistributedGPT3Pipeline(DistributedPipeline): + """This class is used to instantiate the gpt3 model. 
+ """ + + model = None + + def __init__(self, model, preprocessor=None, **kwargs): + if preprocessor is None: + preprocessor = TextGenerationJiebaPreprocessor(model) + super().__init__(model, preprocessor=preprocessor, **kwargs) + assert hasattr(preprocessor, 'tokenizer') + + @classmethod + def _instantiate_one(cls, rank, model_dir, **kwargs): + cls.model = DistributedGPT3(model_dir, rank, **kwargs) + cls.model.eval() + + @classmethod + def _forward_one(cls, inputs: Dict[str, Any]) -> Dict[str, Any]: + tokens = inputs['inputs']['input_ids'].cuda( + torch.cuda.current_device()) + return cls.model.generate(tokens) + + def postprocess(self, inputs: Dict[str, Any], + **postprocess_params) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + from modelscope.outputs import OutputKeys + return { + OutputKeys.TEXT: + self.preprocessor.tokenizer.detokenize(inputs[0].tolist()) + } diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 43fa64a7..f7defd92 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, ) from .space import (DialogIntentPredictionPreprocessor, @@ -72,6 +73,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', ], 'space': [ diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index a753fe6c..f7478329 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, ) @@ -42,6 +43,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', ], 'text_error_correction': [ diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 3d708634..267dbb8c 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -494,6 +494,41 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): } +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) +class TextGenerationJiebaPreprocessor(Preprocessor): + """The jieba tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, model_dir: str, *args, **kwargs): + from modelscope.models.nlp.gpt3 import JiebaBPETokenizer + super().__init__(*args, **kwargs) + self.tokenizer = JiebaBPETokenizer( + osp.join(model_dir, 'tokenizer.json')) + + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' + Returns: + Dict[str, Any]: the preprocessed data + Example: + {'net_input': + {'src_tokens':tensor([1,2,3,4]), + 'src_lengths': tensor([4])} + } + """ + import torch + + return { + 'input_ids': + torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) + } + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.word_segment_text_to_label_preprocessor) diff --git a/modelscope/utils/nlp/distributed.py b/modelscope/utils/nlp/distributed.py index 2b590a10..53332c0f 100755 --- a/modelscope/utils/nlp/distributed.py +++ b/modelscope/utils/nlp/distributed.py @@ -35,7 +35,10 @@ def initialize_distributed(rank, mpu, world_size, model_parallel_size, init_method = 'tcp://' init_method += master_ip + ':' + master_port torch.distributed.init_process_group( - backend='nccl', world_size=8, rank=rank, init_method=init_method) + backend='nccl', + world_size=world_size, + rank=rank, + init_method=init_method) # Set the model-parallel communicators. mpu.initialize_model_parallel(model_parallel_size) diff --git a/tests/pipelines/test_gpt3_text_generation.py b/tests/pipelines/test_gpt3_text_generation.py new file mode 100644 index 00000000..413b5874 --- /dev/null +++ b/tests/pipelines/test_gpt3_text_generation.py @@ -0,0 +1,58 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class TextGPT3GenerationTest(unittest.TestCase): + + def setUp(self) -> None: + # please make sure this local path exists. 
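        # The three model ids below cover the 1.3B / 2.7B / 13B variants; only
        # the 13B checkpoint is snapshot-downloaded up front, since its test
        # expects a local directory populated with tensor-parallel shards (see
        # the test_gpt3_13B docstring below).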
+ self.model_id_1_3B = 'damo/nlp_gpt3_text-generation_1.3B' + self.model_id_2_7B = 'damo/nlp_gpt3_text-generation_2.7B' + self.model_id_13B = 'damo/nlp_gpt3_text-generation_13B' + self.model_dir_13B = snapshot_download(self.model_id_13B) + self.input = '好的' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_gpt3_1_3B(self): + pipe = pipeline(Tasks.text_generation, model=self.model_id_1_3B) + print(pipe(self.input)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_gpt3_2_7B(self): + pipe = pipeline(Tasks.text_generation, model=self.model_id_2_7B) + print(pipe(self.input)) + + @unittest.skip('distributed gpt3 13B, skipped') + def test_gpt3_13B(self): + """ The model can be downloaded from the link on + TODO: add gpt3 checkpoint link + After downloading, you should have a gpt3 model structure like this: + nlp_gpt3_text-generation_13B + |_ config.json + |_ configuration.json + |_ tokenizer.json + |_ model <-- an empty directory + + Model binaries shall be downloaded separately to populate the model directory, so that + the model directory would contain the following binaries: + |_ model + |_ mp_rank_00_model_states.pt + |_ mp_rank_01_model_states.pt + |_ mp_rank_02_model_states.pt + |_ mp_rank_03_model_states.pt + |_ mp_rank_04_model_states.pt + |_ mp_rank_05_model_states.pt + |_ mp_rank_06_model_states.pt + |_ mp_rank_07_model_states.pt + """ + pipe = pipeline(Tasks.text_generation, model=self.model_dir_13B) + print(pipe(self.input)) + + +if __name__ == '__main__': + unittest.main() From cb570d586cb5f4a467de9aad1e058e3cd3276518 Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Tue, 18 Oct 2022 16:10:10 +0800 Subject: [PATCH 57/57] add referring video object segmentation pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10400324 --- ...g_video_object_segmentation_test_video.mp4 | 3 + modelscope/metainfo.py | 2 + modelscope/models/cv/__init__.py | 3 +- .../__init__.py | 23 + .../model.py | 65 ++ .../utils/__init__.py | 4 + .../utils/backbone.py | 198 +++++ .../utils/misc.py | 234 ++++++ .../utils/mttr.py | 128 +++ .../utils/multimodal_transformer.py | 440 +++++++++++ .../utils/position_encoding_2d.py | 57 ++ .../utils/postprocessing.py | 119 +++ .../utils/segmentation.py | 137 ++++ .../utils/swin_transformer.py | 731 ++++++++++++++++++ modelscope/outputs.py | 6 + modelscope/pipelines/builder.py | 3 + modelscope/pipelines/cv/__init__.py | 4 + ...ring_video_object_segmentation_pipeline.py | 193 +++++ modelscope/utils/constant.py | 3 + requirements/cv.txt | 2 + ...est_referring_video_object_segmentation.py | 56 ++ 21 files changed, 2410 insertions(+), 1 deletion(-) create mode 100644 data/test/videos/referring_video_object_segmentation_test_video.mp4 create mode 100644 modelscope/models/cv/referring_video_object_segmentation/__init__.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/model.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/misc.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py 
create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py create mode 100644 modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py create mode 100644 tests/pipelines/test_referring_video_object_segmentation.py diff --git a/data/test/videos/referring_video_object_segmentation_test_video.mp4 b/data/test/videos/referring_video_object_segmentation_test_video.mp4 new file mode 100644 index 00000000..529595a5 --- /dev/null +++ b/data/test/videos/referring_video_object_segmentation_test_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49c9bc74a60860c360a4bf4509fe9db915279aaabd953f354f2c38e9be1e6cb +size 2924691 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 2dbff948..fc18ead9 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -34,6 +34,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' resnet50_bert = 'resnet50-bert' + referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' fer = 'fer' retinaface = 'retinaface' shop_segmentation = 'shop-segmentation' @@ -203,6 +204,7 @@ class Pipelines(object): face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' image_body_reshaping = 'flow-based-body-reshaping' + referring_video_object_segmentation = 'referring-video-object-segmentation' # nlp tasks automatic_post_editing = 'automatic-post-editing' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index fd950f4c..64039863 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -12,7 +12,8 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints, image_to_image_generation, image_to_image_translation, movie_scene_segmentation, object_detection, product_retrieval_embedding, realtime_object_detection, - salient_detection, shop_segmentation, super_resolution, + referring_video_object_segmentation, salient_detection, + shop_segmentation, super_resolution, video_single_object_tracking, video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/referring_video_object_segmentation/__init__.py b/modelscope/models/cv/referring_video_object_segmentation/__init__.py new file mode 100644 index 00000000..58dbf7b0 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + + from .model import MovieSceneSegmentation + +else: + _import_structure = { + 'model': ['MovieSceneSegmentation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/referring_video_object_segmentation/model.py b/modelscope/models/cv/referring_video_object_segmentation/model.py new file mode 100644 index 00000000..902a3416 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/model.py @@ -0,0 +1,65 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
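# Note: this file defines ReferringVideoObjectSegmentation, registered below
# under Models.referring_video_object_segmentation; the lazy-import table in
# the package __init__.py above still references 'MovieSceneSegmentation',
# which looks like a copy-paste leftover from the movie-scene-segmentation model.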
+import os.path as osp +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .utils import (MTTR, A2DSentencesPostProcess, ReferYoutubeVOSPostProcess, + nested_tensor_from_videos_list) + +logger = get_logger() + + +@MODELS.register_module( + Tasks.referring_video_object_segmentation, + module_name=Models.referring_video_object_segmentation) +class ReferringVideoObjectSegmentation(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, *args, **kwargs) + + config_path = osp.join(model_dir, ModelFile.CONFIGURATION) + self.cfg = Config.from_file(config_path) + self.model = MTTR(**self.cfg.model) + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + params_dict = torch.load(model_path, map_location='cpu') + if 'model_state_dict' in params_dict.keys(): + params_dict = params_dict['model_state_dict'] + self.model.load_state_dict(params_dict, strict=True) + + dataset_name = self.cfg.pipeline.dataset_name + if dataset_name == 'a2d_sentences' or dataset_name == 'jhmdb_sentences': + self.postprocessor = A2DSentencesPostProcess() + elif dataset_name == 'ref_youtube_vos': + self.postprocessor = ReferYoutubeVOSPostProcess() + else: + assert False, f'postprocessing for dataset: {dataset_name} is not supported' + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]: + return inputs + + def inference(self, **kwargs): + window = kwargs['window'] + text_query = kwargs['text_query'] + video_metadata = kwargs['metadata'] + + window = nested_tensor_from_videos_list([window]) + valid_indices = torch.arange(len(window.tensors)) + if self._device_name == 'gpu': + valid_indices = valid_indices.cuda() + outputs = self.model(window, valid_indices, [text_query]) + window_masks = self.postprocessor( + outputs, [video_metadata], + window.tensors.shape[-2:])[0]['pred_masks'] + return window_masks + + def postprocess(self, inputs: Dict[str, Any], **kwargs): + return inputs diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py b/modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py new file mode 100644 index 00000000..796bd6f4 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
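+#
+# Re-exports the pieces of the MTTR implementation used by model.py: the MTTR network
+# itself, the video batching helper, and the dataset-specific postprocessors.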
+from .misc import nested_tensor_from_videos_list +from .mttr import MTTR +from .postprocessing import A2DSentencesPostProcess, ReferYoutubeVOSPostProcess diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py b/modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py new file mode 100644 index 00000000..afa384c1 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py @@ -0,0 +1,198 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR + +import torch +import torch.nn.functional as F +import torchvision +from einops import rearrange +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter + +from .misc import NestedTensor, is_main_process +from .swin_transformer import SwinTransformer3D + + +class VideoSwinTransformerBackbone(nn.Module): + """ + A wrapper which allows using Video-Swin Transformer as a temporal encoder for MTTR. + Check out video-swin's original paper at: https://arxiv.org/abs/2106.13230 for more info about this architecture. + Only the 'tiny' version of video swin was tested and is currently supported in our project. + Additionally, we slightly modify video-swin to make it output per-frame embeddings as required by MTTR (check our + paper's supplementary for more details), and completely discard of its 4th block. + """ + + def __init__(self, backbone_pretrained, backbone_pretrained_path, + train_backbone, running_mode, **kwargs): + super(VideoSwinTransformerBackbone, self).__init__() + # patch_size is (1, 4, 4) instead of the original (2, 4, 4). + # this prevents swinT's original temporal downsampling so we can get per-frame features. + swin_backbone = SwinTransformer3D( + patch_size=(1, 4, 4), + embed_dim=96, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + window_size=(8, 7, 7), + drop_path_rate=0.1, + patch_norm=True) + if backbone_pretrained and running_mode == 'train': + state_dict = torch.load(backbone_pretrained_path)['state_dict'] + # extract swinT's kinetics-400 pretrained weights and ignore the rest (prediction head etc.) + state_dict = { + k[9:]: v + for k, v in state_dict.items() if 'backbone.' 
in k + } + + # sum over the patch embedding weight temporal dim [96, 3, 2, 4, 4] --> [96, 3, 1, 4, 4] + patch_embed_weight = state_dict['patch_embed.proj.weight'] + patch_embed_weight = patch_embed_weight.sum(dim=2, keepdims=True) + state_dict['patch_embed.proj.weight'] = patch_embed_weight + swin_backbone.load_state_dict(state_dict) + + self.patch_embed = swin_backbone.patch_embed + self.pos_drop = swin_backbone.pos_drop + self.layers = swin_backbone.layers[:-1] + self.downsamples = nn.ModuleList() + for layer in self.layers: + self.downsamples.append(layer.downsample) + layer.downsample = None + self.downsamples[ + -1] = None # downsampling after the last layer is not necessary + + self.layer_output_channels = [ + swin_backbone.embed_dim * 2**i for i in range(len(self.layers)) + ] + self.train_backbone = train_backbone + if not train_backbone: + for parameter in self.parameters(): + parameter.requires_grad_(False) + + def forward(self, samples: NestedTensor): + vid_frames = rearrange(samples.tensors, 't b c h w -> b c t h w') + + vid_embeds = self.patch_embed(vid_frames) + vid_embeds = self.pos_drop(vid_embeds) + layer_outputs = [] # layer outputs before downsampling + for layer, downsample in zip(self.layers, self.downsamples): + vid_embeds = layer(vid_embeds.contiguous()) + layer_outputs.append(vid_embeds) + if downsample: + vid_embeds = rearrange(vid_embeds, 'b c t h w -> b t h w c') + vid_embeds = downsample(vid_embeds) + vid_embeds = rearrange(vid_embeds, 'b t h w c -> b c t h w') + layer_outputs = [ + rearrange(o, 'b c t h w -> t b c h w') for o in layer_outputs + ] + + outputs = [] + orig_pad_mask = samples.mask + for l_out in layer_outputs: + pad_mask = F.interpolate( + orig_pad_mask.float(), size=l_out.shape[-2:]).to(torch.bool) + outputs.append(NestedTensor(l_out, pad_mask)) + return outputs + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +class FrozenBatchNorm2d(torch.nn.Module): + """ + Modified from DETR https://github.com/facebookresearch/detr + BatchNorm2d where the batch statistics and the affine parameters are fixed. + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. + """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer('weight', torch.ones(n)) + self.register_buffer('bias', torch.zeros(n)) + self.register_buffer('running_mean', torch.zeros(n)) + self.register_buffer('running_var', torch.ones(n)) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + num_batches_tracked_key = prefix + 'num_batches_tracked' + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, + self)._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, + unexpected_keys, error_msgs) + + def forward(self, x): + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + +class ResNetBackbone(nn.Module): + """ + Modified from DETR https://github.com/facebookresearch/detr + ResNet backbone with frozen BatchNorm. 
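+    Used by init_backbone (below) when a torchvision ResNet variant is requested instead
+    of 'swin-t'; the video frames are folded into the batch dimension and per-layer
+    feature maps are returned as NestedTensors.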
+ """ + + def __init__(self, + backbone_name: str = 'resnet50', + train_backbone: bool = True, + dilation: bool = True, + **kwargs): + super(ResNetBackbone, self).__init__() + backbone = getattr(torchvision.models, backbone_name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=is_main_process(), + norm_layer=FrozenBatchNorm2d) + for name, parameter in backbone.named_parameters(): + if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: + parameter.requires_grad_(False) + return_layers = { + 'layer1': '0', + 'layer2': '1', + 'layer3': '2', + 'layer4': '3' + } + self.body = IntermediateLayerGetter( + backbone, return_layers=return_layers) + output_channels = 512 if backbone_name in ('resnet18', + 'resnet34') else 2048 + self.layer_output_channels = [ + output_channels // 8, output_channels // 4, output_channels // 2, + output_channels + ] + + def forward(self, tensor_list: NestedTensor): + t, b, _, _, _ = tensor_list.tensors.shape + video_frames = rearrange(tensor_list.tensors, + 't b c h w -> (t b) c h w') + padding_masks = rearrange(tensor_list.mask, 't b h w -> (t b) h w') + features_list = self.body(video_frames) + out = [] + for _, f in features_list.items(): + resized_padding_masks = F.interpolate( + padding_masks[None].float(), + size=f.shape[-2:]).to(torch.bool)[0] + f = rearrange(f, '(t b) c h w -> t b c h w', t=t, b=b) + resized_padding_masks = rearrange( + resized_padding_masks, '(t b) h w -> t b h w', t=t, b=b) + out.append(NestedTensor(f, resized_padding_masks)) + return out + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +def init_backbone(backbone_name, **kwargs): + if backbone_name == 'swin-t': + return VideoSwinTransformerBackbone(**kwargs) + elif 'resnet' in backbone_name: + return ResNetBackbone(backbone_name, **kwargs) + assert False, f'error: backbone "{backbone_name}" is not supported' diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/misc.py b/modelscope/models/cv/referring_video_object_segmentation/utils/misc.py new file mode 100644 index 00000000..ecf34b8c --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/misc.py @@ -0,0 +1,234 @@ +# Modified from DETR https://github.com/facebookresearch/detr +# Misc functions. +# Mostly copy-paste from torchvision references. 
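+# NestedTensor and the padding/batching helpers below are used by the backbone and model;
+# the gather/reduce utilities are only needed for distributed (multi-GPU) training.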
+ +import pickle +from typing import List, Optional + +import torch +import torch.distributed as dist +# needed due to empty tensor bug in pytorch and torchvision 0.5 +import torchvision +from torch import Tensor + +if float(torchvision.__version__.split('.')[1]) < 7.0: + from torchvision.ops import _new_empty_tensor + from torchvision.ops.misc import _output_size + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to('cuda') + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device='cuda') + size_list = [torch.tensor([0], device='cuda') for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append( + torch.empty((max_size, ), dtype=torch.uint8, device='cuda')) + if local_size != max_size: + padding = torch.empty( + size=(max_size - local_size, ), dtype=torch.uint8, device='cuda') + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + """ + This function receives a list of image tensors and returns a NestedTensor of the padded images, along with their + padding masks (true for padding areas, false otherwise). 
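+    For example, two images of shapes [3, 480, 640] and [3, 360, 500] are padded into a
+    tensor of shape [2, 3, 480, 640] together with a boolean mask of shape [2, 480, 640].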
+ """ + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img) + m[:img.shape[1], :img.shape[2]] = False + return NestedTensor(tensor, mask) + + +def nested_tensor_from_videos_list(videos_list: List[Tensor]): + """ + This function receives a list of videos (each of shape [T, C, H, W]) and returns a NestedTensor of the padded + videos (shape [T, B, C, PH, PW], along with their padding masks (true for padding areas, false otherwise, of shape + [T, B, PH, PW]. + """ + max_size = _max_by_axis([list(img.shape) for img in videos_list]) + padded_batch_shape = [len(videos_list)] + max_size + b, t, c, h, w = padded_batch_shape + dtype = videos_list[0].dtype + device = videos_list[0].device + padded_videos = torch.zeros(padded_batch_shape, dtype=dtype, device=device) + videos_pad_masks = torch.ones((b, t, h, w), + dtype=torch.bool, + device=device) + for vid_frames, pad_vid_frames, vid_pad_m in zip(videos_list, + padded_videos, + videos_pad_masks): + pad_vid_frames[:vid_frames.shape[0], :, :vid_frames. + shape[2], :vid_frames.shape[3]].copy_(vid_frames) + vid_pad_m[:vid_frames.shape[0], :vid_frames.shape[2], :vid_frames. + shape[3]] = False + # transpose the temporal and batch dims and create a NestedTensor: + return NestedTensor( + padded_videos.transpose(0, 1), videos_pad_masks.transpose(0, 1)) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def interpolate(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None): + # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor + """ + Equivalent to nn.functional.interpolate, but with support for empty batch sizes. + This will eventually be supported natively by PyTorch, and this + class can go away. 
+ """ + if float(torchvision.__version__.split('.')[1]) < 7.0: + if input.numel() > 0: + return torch.nn.functional.interpolate(input, size, scale_factor, + mode, align_corners) + + output_shape = _output_size(2, input, size, scale_factor) + output_shape = list(input.shape[:-2]) + list(output_shape) + return _new_empty_tensor(input, output_shape) + else: + return torchvision.ops.misc.interpolate(input, size, scale_factor, + mode, align_corners) diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py b/modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py new file mode 100644 index 00000000..e603df6c --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py @@ -0,0 +1,128 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR + +import torch +import torch.nn.functional as F +from einops import rearrange +from torch import nn + +from .backbone import init_backbone +from .misc import NestedTensor +from .multimodal_transformer import MultimodalTransformer +from .segmentation import FPNSpatialDecoder + + +class MTTR(nn.Module): + """ The main module of the Multimodal Tracking Transformer """ + + def __init__(self, + num_queries, + mask_kernels_dim=8, + aux_loss=False, + **kwargs): + """ + Parameters: + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + MTTR can detect in a single image. In our paper we use 50 in all settings. + mask_kernels_dim: dim of the segmentation kernels and of the feature maps outputted by the spatial decoder. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.backbone = init_backbone(**kwargs) + self.transformer = MultimodalTransformer(**kwargs) + d_model = self.transformer.d_model + self.is_referred_head = nn.Linear( + d_model, + 2) # binary 'is referred?' prediction head for object queries + self.instance_kernels_head = MLP( + d_model, d_model, output_dim=mask_kernels_dim, num_layers=2) + self.obj_queries = nn.Embedding( + num_queries, d_model) # pos embeddings for the object queries + self.vid_embed_proj = nn.Conv2d( + self.backbone.layer_output_channels[-1], d_model, kernel_size=1) + self.spatial_decoder = FPNSpatialDecoder( + d_model, self.backbone.layer_output_channels[:-1][::-1], + mask_kernels_dim) + self.aux_loss = aux_loss + + def forward(self, samples: NestedTensor, valid_indices, text_queries): + """The forward expects a NestedTensor, which consists of: + - samples.tensor: Batched frames of shape [time x batch_size x 3 x H x W] + - samples.mask: A binary mask of shape [time x batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_is_referred": The reference prediction logits for all queries. + Shape: [time x batch_size x num_queries x 2] + - "pred_masks": The mask logits for all queries. + Shape: [time x batch_size x num_queries x H_mask x W_mask] + - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of + dictionaries containing the two above keys for each decoder layer. + """ + backbone_out = self.backbone(samples) + # keep only the valid frames (frames which are annotated): + # (for example, in a2d-sentences only the center frame in each window is annotated). 
+ for layer_out in backbone_out: + layer_out.tensors = layer_out.tensors.index_select( + 0, valid_indices) + layer_out.mask = layer_out.mask.index_select(0, valid_indices) + bbone_final_layer_output = backbone_out[-1] + vid_embeds, vid_pad_mask = bbone_final_layer_output.decompose() + + T, B, _, _, _ = vid_embeds.shape + vid_embeds = rearrange(vid_embeds, 't b c h w -> (t b) c h w') + vid_embeds = self.vid_embed_proj(vid_embeds) + vid_embeds = rearrange( + vid_embeds, '(t b) c h w -> t b c h w', t=T, b=B) + + transformer_out = self.transformer(vid_embeds, vid_pad_mask, + text_queries, + self.obj_queries.weight) + # hs is: [L, T, B, N, D] where L is number of decoder layers + # vid_memory is: [T, B, D, H, W] + # txt_memory is a list of length T*B of [S, C] where S might be different for each sentence + # encoder_middle_layer_outputs is a list of [T, B, H, W, D] + hs, vid_memory, txt_memory = transformer_out + + vid_memory = rearrange(vid_memory, 't b d h w -> (t b) d h w') + bbone_middle_layer_outputs = [ + rearrange(o.tensors, 't b d h w -> (t b) d h w') + for o in backbone_out[:-1][::-1] + ] + decoded_frame_features = self.spatial_decoder( + vid_memory, bbone_middle_layer_outputs) + decoded_frame_features = rearrange( + decoded_frame_features, '(t b) d h w -> t b d h w', t=T, b=B) + instance_kernels = self.instance_kernels_head(hs) # [L, T, B, N, C] + # output masks is: [L, T, B, N, H_mask, W_mask] + output_masks = torch.einsum('ltbnc,tbchw->ltbnhw', instance_kernels, + decoded_frame_features) + outputs_is_referred = self.is_referred_head(hs) # [L, T, B, N, 2] + + layer_outputs = [] + for pm, pir in zip(output_masks, outputs_is_referred): + layer_out = {'pred_masks': pm, 'pred_is_referred': pir} + layer_outputs.append(layer_out) + out = layer_outputs[ + -1] # the output for the last decoder layer is used by default + if self.aux_loss: + out['aux_outputs'] = layer_outputs[:-1] + return out + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py b/modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py new file mode 100644 index 00000000..8c24e397 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py @@ -0,0 +1,440 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# MTTR Multimodal Transformer class. 
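+# It flattens the per-frame visual features, concatenates them with RoBERTa text embeddings
+# along the sequence dimension, runs a transformer encoder over the joint sequence, and
+# decodes a fixed set of object queries against the resulting memory.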
+# Modified from DETR https://github.com/facebookresearch/detr + +import copy +import os +from typing import Optional + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import Tensor, nn +from transformers import RobertaModel, RobertaTokenizerFast + +from .position_encoding_2d import PositionEmbeddingSine2D + +os.environ[ + 'TOKENIZERS_PARALLELISM'] = 'false' # this disables a huggingface tokenizer warning (printed every epoch) + + +class MultimodalTransformer(nn.Module): + + def __init__(self, + num_encoder_layers=3, + num_decoder_layers=3, + text_encoder_type='roberta-base', + freeze_text_encoder=True, + **kwargs): + super().__init__() + self.d_model = kwargs['d_model'] + encoder_layer = TransformerEncoderLayer(**kwargs) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers) + decoder_layer = TransformerDecoderLayer(**kwargs) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + norm=nn.LayerNorm(self.d_model), + return_intermediate=True) + self.pos_encoder_2d = PositionEmbeddingSine2D() + self._reset_parameters() + + self.text_encoder = RobertaModel.from_pretrained(text_encoder_type) + self.text_encoder.pooler = None # this pooler is never used, this is a hack to avoid DDP problems... + self.tokenizer = RobertaTokenizerFast.from_pretrained( + text_encoder_type) + self.freeze_text_encoder = freeze_text_encoder + if freeze_text_encoder: + for p in self.text_encoder.parameters(): + p.requires_grad_(False) + + self.txt_proj = FeatureResizer( + input_feat_size=self.text_encoder.config.hidden_size, + output_feat_size=self.d_model, + dropout=kwargs['dropout'], + ) + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, vid_embeds, vid_pad_mask, text_queries, obj_queries): + device = vid_embeds.device + t, b, _, h, w = vid_embeds.shape + + txt_memory, txt_pad_mask = self.forward_text(text_queries, device) + # add temporal dim to txt memory & padding mask: + txt_memory = repeat(txt_memory, 's b c -> s (t b) c', t=t) + txt_pad_mask = repeat(txt_pad_mask, 'b s -> (t b) s', t=t) + + vid_embeds = rearrange(vid_embeds, 't b c h w -> (h w) (t b) c') + # Concat the image & text embeddings on the sequence dimension + encoder_src_seq = torch.cat((vid_embeds, txt_memory), dim=0) + seq_mask = torch.cat( + (rearrange(vid_pad_mask, 't b h w -> (t b) (h w)'), txt_pad_mask), + dim=1) + # vid_pos_embed is: [T*B, H, W, d_model] + vid_pos_embed = self.pos_encoder_2d( + rearrange(vid_pad_mask, 't b h w -> (t b) h w'), self.d_model) + # use zeros in place of pos embeds for the text sequence: + pos_embed = torch.cat( + (rearrange(vid_pos_embed, 't_b h w c -> (h w) t_b c'), + torch.zeros_like(txt_memory)), + dim=0) + + memory = self.encoder( + encoder_src_seq, src_key_padding_mask=seq_mask, + pos=pos_embed) # [S, T*B, C] + vid_memory = rearrange( + memory[:h * w, :, :], + '(h w) (t b) c -> t b c h w', + h=h, + w=w, + t=t, + b=b) + txt_memory = memory[h * w:, :, :] + txt_memory = rearrange(txt_memory, 's t_b c -> t_b s c') + txt_memory = [ + t_mem[~pad_mask] + for t_mem, pad_mask in zip(txt_memory, txt_pad_mask) + ] # remove padding + + # add T*B dims to query embeds (was: [N, C], where N is the number of object queries): + obj_queries = repeat(obj_queries, 'n c -> n (t b) c', t=t, b=b) + tgt = torch.zeros_like(obj_queries) # [N, T*B, C] + + # hs is [L, N, T*B, C] where L is number of layers in the decoder + hs = self.decoder( + tgt, + memory, + 
memory_key_padding_mask=seq_mask, + pos=pos_embed, + query_pos=obj_queries) + hs = rearrange(hs, 'l n (t b) c -> l t b n c', t=t, b=b) + return hs, vid_memory, txt_memory + + def forward_text(self, text_queries, device): + tokenized_queries = self.tokenizer.batch_encode_plus( + text_queries, padding='longest', return_tensors='pt') + tokenized_queries = tokenized_queries.to(device) + with torch.inference_mode(mode=self.freeze_text_encoder): + encoded_text = self.text_encoder(**tokenized_queries) + # Transpose memory because pytorch's attention expects sequence first + txt_memory = rearrange(encoded_text.last_hidden_state, + 'b s c -> s b c') + txt_memory = self.txt_proj( + txt_memory) # change text embeddings dim to model dim + # Invert attention mask that we get from huggingface because its the opposite in pytorch transformer + txt_pad_mask = tokenized_queries.attention_mask.ne(1).bool() # [B, S] + return txt_memory, txt_pad_mask + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, + src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + output = src + + for layer in self.layers: + output = layer( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + pos=pos) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, + query_pos=query_pos) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, + d_model, + nheads, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + normalize_before=False, + **kwargs): + super().__init__() + self.self_attn = nn.MultiheadAttention( + d_model, nheads, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = 
normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn( + q, + k, + value=src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn( + q, + k, + value=src2, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, + d_model, + nheads, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + normalize_before=False, + **kwargs): + super().__init__() + self.self_attn = nn.MultiheadAttention( + d_model, nheads, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention( + d_model, nheads, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + 
memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt2, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, + memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, + pos, query_pos) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +class FeatureResizer(nn.Module): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). + """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == 'relu': + return F.relu + if activation == 'gelu': + return F.gelu + if activation == 'glu': + return F.glu + raise RuntimeError(F'activation should be relu/gelu, not {activation}.') diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py b/modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py new file mode 100644 index 00000000..f9ef05a1 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py @@ -0,0 +1,57 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# Modified from DETR https://github.com/facebookresearch/detr +# 2D sine positional encodings for the visual features in the multimodal transformer. + +import math + +import torch +from torch import Tensor, nn + + +class PositionEmbeddingSine2D(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
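+    Each spatial axis gets hidden_dim // 2 channels: the (optionally normalized) cumulative
+    row/column index is divided by temperature**(2i / (hidden_dim // 2)) and passed through
+    interleaved sin/cos, and the y and x encodings are concatenated along the channel dim.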
+ """ + + def __init__(self, temperature=10000, normalize=True, scale=None): + super().__init__() + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError('normalize should be True if scale is passed') + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, mask: Tensor, hidden_dim: int): + """ + @param mask: a tensor of shape [B, H, W] + @param hidden_dim: int + @return: + """ + num_pos_feats = hidden_dim // 2 + + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3) + return pos diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py new file mode 100644 index 00000000..64582140 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py @@ -0,0 +1,119 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR + +import numpy as np +import pycocotools.mask as mask_util +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + + +class A2DSentencesPostProcess(nn.Module): + """ + This module converts the model's output into the format expected by the coco api for the given task + """ + + def __init__(self): + super(A2DSentencesPostProcess, self).__init__() + + @torch.inference_mode() + def forward(self, outputs, resized_padded_sample_size, + resized_sample_sizes, orig_sample_sizes): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + resized_padded_sample_size: size of samples (input to model) after size augmentation + padding. + resized_sample_sizes: size of samples after size augmentation but without padding. 
+ orig_sample_sizes: original size of the samples (no augmentations or padding) + """ + pred_is_referred = outputs['pred_is_referred'] + prob = F.softmax(pred_is_referred, dim=-1) + scores = prob[..., 0] + pred_masks = outputs['pred_masks'] + pred_masks = F.interpolate( + pred_masks, + size=resized_padded_sample_size, + mode='bilinear', + align_corners=False) + pred_masks = (pred_masks.sigmoid() > 0.5) + processed_pred_masks, rle_masks = [], [] + for f_pred_masks, resized_size, orig_size in zip( + pred_masks, resized_sample_sizes, orig_sample_sizes): + f_mask_h, f_mask_w = resized_size # resized shape without padding + # remove the samples' padding + f_pred_masks_no_pad = f_pred_masks[:, :f_mask_h, : + f_mask_w].unsqueeze(1) + # resize the samples back to their original dataset (target) size for evaluation + f_pred_masks_processed = F.interpolate( + f_pred_masks_no_pad.float(), size=orig_size, mode='nearest') + f_pred_rle_masks = [ + mask_util.encode( + np.array( + mask[0, :, :, np.newaxis], dtype=np.uint8, + order='F'))[0] + for mask in f_pred_masks_processed.cpu() + ] + processed_pred_masks.append(f_pred_masks_processed) + rle_masks.append(f_pred_rle_masks) + predictions = [{ + 'scores': s, + 'masks': m, + 'rle_masks': rle + } for s, m, rle in zip(scores, processed_pred_masks, rle_masks)] + return predictions + + +class ReferYoutubeVOSPostProcess(nn.Module): + """ + This module converts the model's output into the format expected by the coco api for the given task + """ + + def __init__(self): + super(ReferYoutubeVOSPostProcess, self).__init__() + + @torch.inference_mode() + def forward(self, outputs, videos_metadata, samples_shape_with_padding): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + videos_metadata: a dictionary with each video's metadata. + samples_shape_with_padding: size of the batch frames with padding. 
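+        Returns:
+            a list with one dict per video, merging that video's metadata with 'pred_masks',
+            a uint8 CPU tensor of binary masks resized back to the original frame size.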
+ """ + pred_is_referred = outputs['pred_is_referred'] + prob_is_referred = F.softmax(pred_is_referred, dim=-1) + # note we average on the temporal dim to compute score per trajectory: + trajectory_scores = prob_is_referred[..., 0].mean(dim=0) + pred_trajectory_indices = torch.argmax(trajectory_scores, dim=-1) + pred_masks = rearrange(outputs['pred_masks'], + 't b nq h w -> b t nq h w') + # keep only the masks of the chosen trajectories: + b = pred_masks.shape[0] + pred_masks = pred_masks[torch.arange(b), :, pred_trajectory_indices] + # resize the predicted masks to the size of the model input (which might include padding) + pred_masks = F.interpolate( + pred_masks, + size=samples_shape_with_padding, + mode='bilinear', + align_corners=False) + # apply a threshold to create binary masks: + pred_masks = (pred_masks.sigmoid() > 0.5) + # remove the padding per video (as videos might have different resolutions and thus different padding): + preds_by_video = [] + for video_pred_masks, video_metadata in zip(pred_masks, + videos_metadata): + # size of the model input batch frames without padding: + resized_h, resized_w = video_metadata['resized_frame_size'] + video_pred_masks = video_pred_masks[:, :resized_h, : + resized_w].unsqueeze( + 1) # remove the padding + # resize the masks back to their original frames dataset size for evaluation: + original_frames_size = video_metadata['original_frame_size'] + tuple_size = tuple(original_frames_size.cpu().numpy()) + video_pred_masks = F.interpolate( + video_pred_masks.float(), size=tuple_size, mode='nearest') + video_pred_masks = video_pred_masks.to(torch.uint8).cpu() + # combine the predicted masks and the video metadata to create a final predictions dict: + video_pred = {**video_metadata, **{'pred_masks': video_pred_masks}} + preds_by_video.append(video_pred) + return preds_by_video diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py b/modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py new file mode 100644 index 00000000..b3228820 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py @@ -0,0 +1,137 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# Modified from DETR https://github.com/facebookresearch/detr + +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class FPNSpatialDecoder(nn.Module): + """ + An FPN-like spatial decoder. Generates high-res, semantically rich features which serve as the base for creating + instance segmentation masks. 
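+    It fuses the transformer's spatial memory (context_dim channels) with the backbone's
+    intermediate feature maps (fpn_dims, ordered from deeper to shallower layers) and outputs
+    mask_kernels_dim-channel maps at the resolution of the shallowest fused feature map.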
+ """ + + def __init__(self, context_dim, fpn_dims, mask_kernels_dim=8): + super().__init__() + + inter_dims = [ + context_dim, context_dim // 2, context_dim // 4, context_dim // 8, + context_dim // 16 + ] + self.lay1 = torch.nn.Conv2d(context_dim, inter_dims[0], 3, padding=1) + self.gn1 = torch.nn.GroupNorm(8, inter_dims[0]) + self.lay2 = torch.nn.Conv2d(inter_dims[0], inter_dims[1], 3, padding=1) + self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) + self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) + self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) + self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.context_dim = context_dim + + self.add_extra_layer = len(fpn_dims) == 3 + if self.add_extra_layer: + self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + self.lay5 = torch.nn.Conv2d( + inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) + self.out_lay = torch.nn.Conv2d( + inter_dims[4], mask_kernels_dim, 3, padding=1) + else: + self.out_lay = torch.nn.Conv2d( + inter_dims[3], mask_kernels_dim, 3, padding=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, layer_features: List[Tensor]): + x = self.lay1(x) + x = self.gn1(x) + x = F.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = F.relu(x) + + cur_fpn = self.adapter1(layer_features[0]) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode='nearest') + x = self.lay3(x) + x = self.gn3(x) + x = F.relu(x) + + cur_fpn = self.adapter2(layer_features[1]) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode='nearest') + x = self.lay4(x) + x = self.gn4(x) + x = F.relu(x) + + if self.add_extra_layer: + cur_fpn = self.adapter3(layer_features[2]) + x = cur_fpn + F.interpolate( + x, size=cur_fpn.shape[-2:], mode='nearest') + x = self.lay5(x) + x = self.gn5(x) + x = F.relu(x) + + x = self.out_lay(x) + return x + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +def dice_loss(inputs, targets, num_masks): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_masks + + +def sigmoid_focal_loss(inputs, + targets, + num_masks, + alpha: float = 0.25, + gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). 
+ gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits( + inputs, targets, reduction='none') + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t)**gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_masks diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py b/modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py new file mode 100644 index 00000000..9a08ef48 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py @@ -0,0 +1,731 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# Modified from Video-Swin-Transformer https://github.com/SwinTransformer/Video-Swin-Transformer + +from functools import lru_cache, reduce +from operator import mul + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from einops import rearrange +from timm.models.layers import DropPath, trunc_normal_ + + +class Mlp(nn.Module): + """ Multilayer perceptron.""" + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, D, H, W, C) + window_size (tuple[int]): window size + + Returns: + windows: (B*num_windows, window_size*window_size, C) + """ + B, D, H, W, C = x.shape + x = x.view(B, D // window_size[0], window_size[0], H // window_size[1], + window_size[1], W // window_size[2], window_size[2], C) + windows = x.permute(0, 1, 3, 5, 2, 4, 6, + 7).contiguous().view(-1, reduce(mul, window_size), C) + return windows + + +def window_reverse(windows, window_size, B, D, H, W): + """ + Args: + windows: (B*num_windows, window_size, window_size, C) + window_size (tuple[int]): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, D, H, W, C) + """ + x = windows.view(B, D // window_size[0], H // window_size[1], + W // window_size[2], window_size[0], window_size[1], + window_size[2], -1) + x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(B, D, H, W, -1) + return x + + +def get_window_size(x_size, window_size, shift_size=None): + use_window_size = list(window_size) + if shift_size is not None: + use_shift_size = list(shift_size) + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +class WindowAttention3D(nn.Module): + """ Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. 
+ window_size (tuple[int]): The temporal length, height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wd, Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + wd, wh, ww = window_size + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * wd - 1) * (2 * wh - 1) * (2 * ww - 1), num_heads)) + + # get pair-wise relative position index for each token inside the window + coords_d = torch.arange(self.window_size[0]) + coords_h = torch.arange(self.window_size[1]) + coords_w = torch.arange(self.window_size[2]) + coords = torch.stack(torch.meshgrid(coords_d, coords_h, + coords_w)) # 3, Wd, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 3, Wd*Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 3, Wd*Wh*Ww, Wd*Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wd*Wh*Ww, Wd*Wh*Ww, 3 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 2] += self.window_size[2] - 1 + + relative_coords[:, :, 0] *= (2 * self.window_size[1] + - 1) * (2 * self.window_size[2] - 1) + relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1) + relative_position_index = relative_coords.sum(-1) # Wd*Wh*Ww, Wd*Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, N, N) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index[:N, :N].reshape(-1)].reshape( + N, N, -1) # Wd*Wh*Ww,Wd*Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wd*Wh*Ww, Wd*Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock3D(nn.Module): + """ Swin Transformer Block. 
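+    Each block applies 3D (shifted-)window multi-head self-attention followed by an MLP,
+    both with pre-LayerNorm, stochastic depth and residual connections.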
+ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): Window size. + shift_size (tuple[int]): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + num_heads, + window_size=(2, 7, 7), + shift_size=(0, 0, 0), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + use_checkpoint=False): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.use_checkpoint = use_checkpoint + + assert 0 <= self.shift_size[0] < self.window_size[ + 0], 'shift_size must in 0-window_size' + assert 0 <= self.shift_size[1] < self.window_size[ + 1], 'shift_size must in 0-window_size' + assert 0 <= self.shift_size[2] < self.window_size[ + 2], 'shift_size must in 0-window_size' + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention3D( + dim, + window_size=self.window_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward_part1(self, x, mask_matrix): + B, D, H, W, C = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + + x = self.norm1(x) + # pad feature maps to multiples of window size + pad_l = pad_t = pad_d0 = 0 + pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0] + pad_b = (window_size[1] - H % window_size[1]) % window_size[1] + pad_r = (window_size[2] - W % window_size[2]) % window_size[2] + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1)) + _, Dp, Hp, Wp, _ = x.shape + # cyclic shift + if any(i > 0 for i in shift_size): + shifted_x = torch.roll( + x, + shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), + dims=(1, 2, 3)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + # partition windows + x_windows = window_partition(shifted_x, + window_size) # B*nW, Wd*Wh*Ww, C + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C + # merge windows + attn_windows = attn_windows.view(-1, *(window_size + (C, ))) + shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, + Wp) # B D' H' W' C + # reverse cyclic shift + if any(i > 0 for i in shift_size): + x = torch.roll( + shifted_x, + shifts=(shift_size[0], shift_size[1], shift_size[2]), + dims=(1, 2, 3)) + else: + x = shifted_x + + if pad_d1 > 0 or pad_r > 0 or pad_b > 0: + x = x[:, :D, :H, :W, :].contiguous() + return x + + def forward_part2(self, x): + return self.drop_path(self.mlp(self.norm2(x))) + + def forward(self, x, mask_matrix): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). + mask_matrix: Attention mask for cyclic shift. + """ + + shortcut = x + if self.use_checkpoint: + x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix) + else: + x = self.forward_part1(x, mask_matrix) + x = shortcut + self.drop_path(x) + + if self.use_checkpoint: + x = x + checkpoint.checkpoint(self.forward_part2, x) + else: + x = x + self.forward_part2(x) + + return x + + +class PatchMerging(nn.Module): + """ Patch Merging Layer + + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). 
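forward_part1 above pads the feature map up to window multiples and applies a cyclic shift before windowed attention, then undoes both afterwards. A minimal sketch of those two steps, with toy tensor sizes that are assumptions rather than values from the patch:

    import torch
    import torch.nn.functional as F

    x = torch.arange(2 * 8 * 8, dtype=torch.float).view(1, 2, 8, 8, 1)  # B, D, H, W, C
    window_size, shift_size = (2, 7, 7), (1, 3, 3)

    # pad H and W up to multiples of the window size (D is already a multiple)
    pad_b = (window_size[1] - 8 % window_size[1]) % window_size[1]
    pad_r = (window_size[2] - 8 % window_size[2]) % window_size[2]
    xp = F.pad(x, (0, 0, 0, pad_r, 0, pad_b, 0, 0))
    print(xp.shape)  # torch.Size([1, 2, 14, 14, 1])

    # the cyclic shift is exactly undone by the reverse roll after attention
    shifted = torch.roll(xp, shifts=(-1, -3, -3), dims=(1, 2, 3))
    restored = torch.roll(shifted, shifts=(1, 3, 3), dims=(1, 2, 3))
    print(torch.equal(xp, restored))  # True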
+ """ + B, D, H, W, C = x.shape + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C + x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C + x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C + x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +# cache each stage results +@lru_cache() +def compute_mask(D, H, W, window_size, shift_size, device): + img_mask = torch.zeros((1, D, H, W, 1), device=device) # 1 Dp Hp Wp 1 + cnt = 0 + for d in slice(-window_size[0]), slice(-window_size[0], + -shift_size[0]), slice( + -shift_size[0], None): + for h in slice(-window_size[1]), slice(-window_size[1], + -shift_size[1]), slice( + -shift_size[1], None): + for w in slice(-window_size[2]), slice(-window_size[2], + -shift_size[2]), slice( + -shift_size[2], None): + img_mask[:, d, h, w, :] = cnt + cnt += 1 + mask_windows = window_partition(img_mask, + window_size) # nW, ws[0]*ws[1]*ws[2], 1 + mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2] + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + return attn_mask + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (tuple[int]): Local window size. Default: (1,7,7). + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + """ + + def __init__(self, + dim, + depth, + num_heads, + window_size=(1, 7, 7), + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + super().__init__() + self.window_size = window_size + self.shift_size = tuple(i // 2 for i in window_size) + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock3D( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + use_checkpoint=use_checkpoint, + ) for i in range(depth) + ]) + + self.downsample = downsample + if self.downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, C, D, H, W). 
+ """ + # calculate attention mask for SW-MSA + B, C, D, H, W = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + x = rearrange(x, 'b c d h w -> b d h w c') + Dp = int(np.ceil(D / window_size[0])) * window_size[0] + Hp = int(np.ceil(H / window_size[1])) * window_size[1] + Wp = int(np.ceil(W / window_size[2])) * window_size[2] + attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device) + for blk in self.blocks: + x = blk(x, attn_mask) + x = x.view(B, D, H, W, -1) + + if self.downsample is not None: + x = self.downsample(x) + x = rearrange(x, 'b d h w c -> b c d h w') + return x + + +class PatchEmbed3D(nn.Module): + """ Video to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + in_chans (int): Number of input video channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, + patch_size=(2, 4, 4), + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, D, H, W = x.size() + if W % self.patch_size[2] != 0: + x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) + if H % self.patch_size[1] != 0: + x = F.pad(x, + (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) + if D % self.patch_size[0] != 0: + x = F.pad( + x, + (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0])) + + x = self.proj(x) # B C D Wh Ww + if self.norm is not None: + D, Wh, Ww = x.size(2), x.size(3), x.size(4) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww) + + return x + + +class SwinTransformer3D(nn.Module): + """ Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + patch_size (int | tuple(int)): Patch size. Default: (4,4,4). + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer: Normalization layer. Default: nn.LayerNorm. + patch_norm (bool): If True, add normalization after patch embedding. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. 
+ """ + + def __init__(self, + pretrained=None, + pretrained2d=True, + patch_size=(4, 4, 4), + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=(2, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + patch_norm=False, + frozen_stages=-1, + use_checkpoint=False): + super().__init__() + + self.pretrained = pretrained + self.pretrained2d = pretrained2d + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.frozen_stages = frozen_stages + self.window_size = window_size + self.patch_size = patch_size + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed3D( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if i_layer < self.num_layers - 1 else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + + # add a norm layer for each output + self.norm = norm_layer(self.num_features) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1: + self.pos_drop.eval() + for i in range(0, self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def inflate_weights(self, logger): + """Inflate the swin2d parameters to swin3d. + + The differences between swin3d and swin2d mainly lie in an extra + axis. To utilize the pretrained parameters in 2d model, + the weight of swin2d models should be inflated to fit in the shapes of + the 3d counterpart. + + Args: + logger (logging.Logger): The logger used to print + debugging infomation. 
+ """ + checkpoint = torch.load(self.pretrained, map_location='cpu') + state_dict = checkpoint['model'] + + # delete relative_position_index since we always re-init it + relative_position_index_keys = [ + k for k in state_dict.keys() if 'relative_position_index' in k + ] + for k in relative_position_index_keys: + del state_dict[k] + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in state_dict.keys() if 'attn_mask' in k] + for k in attn_mask_keys: + del state_dict[k] + + state_dict['patch_embed.proj.weight'] = state_dict[ + 'patch_embed.proj.weight'].unsqueeze(2).repeat( + 1, 1, self.patch_size[0], 1, 1) / self.patch_size[0] + + # bicubic interpolate relative_position_bias_table if not match + relative_position_bias_table_keys = [ + k for k in state_dict.keys() if 'relative_position_bias_table' in k + ] + for k in relative_position_bias_table_keys: + relative_position_bias_table_pretrained = state_dict[k] + relative_position_bias_table_current = self.state_dict()[k] + L1, nH1 = relative_position_bias_table_pretrained.size() + L2, nH2 = relative_position_bias_table_current.size() + L2 = (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1) + wd = self.window_size[0] + if nH1 != nH2: + logger.warning(f'Error in loading {k}, passing') + else: + if L1 != L2: + S1 = int(L1**0.5) + relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate( + relative_position_bias_table_pretrained.permute( + 1, 0).view(1, nH1, S1, S1), + size=(2 * self.window_size[1] - 1, + 2 * self.window_size[2] - 1), + mode='bicubic') + relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.view( + nH2, L2).permute(1, 0) + state_dict[k] = relative_position_bias_table_pretrained.repeat( + 2 * wd - 1, 1) + + msg = self.load_state_dict(state_dict, strict=False) + logger.info(msg) + logger.info(f"=> loaded successfully '{self.pretrained}'") + del checkpoint + torch.cuda.empty_cache() + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x.contiguous()) + + x = rearrange(x, 'n c d h w -> n d h w c') + x = self.norm(x) + x = rearrange(x, 'n d h w c -> n c d h w') + + return x + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer3D, self).train(mode) + self._freeze_stages() diff --git a/modelscope/outputs.py b/modelscope/outputs.py index a49ddacf..fbe15646 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -417,6 +417,12 @@ TASK_OUTPUTS = { # } Tasks.video_summarization: [OutputKeys.OUTPUT], + # referring video object segmentation result for a single video + # { + # "masks": [np.array # 2D array with shape [height, width]] + # } + Tasks.referring_video_object_segmentation: [OutputKeys.MASKS], + # ============ nlp tasks =================== # text classification result for single sample diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 174d10b1..8098bdec 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -202,6 +202,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.face_emotion: (Pipelines.face_emotion, 'damo/cv_face-emotion'), Tasks.product_segmentation: (Pipelines.product_segmentation, 'damo/cv_F3Net_product-segmentation'), + Tasks.referring_video_object_segmentation: + (Pipelines.referring_video_object_segmentation, + 'damo/cv_swin-t_referring_video-object-segmentation'), } diff --git 
a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index f84f5fe5..97cd8761 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -58,6 +58,7 @@ if TYPE_CHECKING:
     from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline
     from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipelin
     from .hand_static_pipeline import HandStaticPipeline
+    from .referring_video_object_segmentation_pipeline import ReferringVideoObjectSegmentationPipeline
 
 else:
     _import_structure = {
@@ -128,6 +129,9 @@ else:
         ['FacialExpressionRecognitionPipeline'],
         'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'],
         'hand_static_pipeline': ['HandStaticPipeline'],
+        'referring_video_object_segmentation_pipeline': [
+            'ReferringVideoObjectSegmentationPipeline'
+        ],
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
new file mode 100644
index 00000000..d264b386
--- /dev/null
+++ b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
@@ -0,0 +1,193 @@
+# The implementation here is modified based on MTTR,
+# originally Apache 2.0 License and publicly available at https://github.com/mttr2021/MTTR
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+import numpy as np
+import torch
+import torchvision
+import torchvision.transforms.functional as F
+from einops import rearrange
+from moviepy.editor import AudioFileClip, ImageSequenceClip, VideoFileClip
+from PIL import Image, ImageDraw, ImageFont, ImageOps
+from tqdm import tqdm
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.referring_video_object_segmentation,
+    module_name=Pipelines.referring_video_object_segmentation)
+class ReferringVideoObjectSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Use `model` to create a referring video object segmentation pipeline for prediction.
+
+        Args:
+            model: model id on modelscope hub
+        """
+        _device = kwargs.pop('device', 'gpu')
+        if torch.cuda.is_available() and _device == 'gpu':
+            self.device = 'gpu'
+        else:
+            self.device = 'cpu'
+        super().__init__(model=model, device=self.device, **kwargs)
+
+        logger.info('Load model done!')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Extract the requested subclip and split it into overlapping frame windows.
+
+        Args:
+            input: a tuple of (video path, text queries, start time, end time)
+
+        """
+        assert isinstance(input, tuple) and len(
+            input
+        ) == 4, 'error - input type must be tuple and input length must be 4'
+        self.input_video_pth, text_queries, start_pt, end_pt = input
+
+        assert 0 < end_pt - start_pt <= 10, 'error - the subclip length must be 0-10 seconds long'
+        assert 1 <= len(
+            text_queries) <= 2, 'error - 1-2 input text queries are expected'
+
+        # extract the relevant subclip:
+        self.input_clip_pth = 'input_clip.mp4'
+        with VideoFileClip(self.input_video_pth) as video:
+            subclip = video.subclip(start_pt, end_pt)
+            subclip.write_videofile(self.input_clip_pth)
+
+        self.window_length = 24  # length of window during inference
+        self.window_overlap = 6  # overlap (in frames) between consecutive windows
+
+        self.video, audio, self.meta = torchvision.io.read_video(
filename=self.input_clip_pth) + self.video = rearrange(self.video, 't h w c -> t c h w') + + input_video = F.resize(self.video, size=360, max_size=640) + if self.device_name == 'gpu': + input_video = input_video.cuda() + + input_video = input_video.to(torch.float).div_(255) + input_video = F.normalize( + input_video, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + video_metadata = { + 'resized_frame_size': input_video.shape[-2:], + 'original_frame_size': self.video.shape[-2:] + } + + # partition the clip into overlapping windows of frames: + windows = [ + input_video[i:i + self.window_length] + for i in range(0, len(input_video), self.window_length + - self.window_overlap) + ] + # clean up the text queries: + self.text_queries = [' '.join(q.lower().split()) for q in text_queries] + + result = { + 'text_queries': self.text_queries, + 'windows': windows, + 'video_metadata': video_metadata + } + + return result + + def forward(self, input: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + pred_masks_per_query = [] + t, _, h, w = self.video.shape + for text_query in tqdm(input['text_queries'], desc='text queries'): + pred_masks = torch.zeros(size=(t, 1, h, w)) + for i, window in enumerate( + tqdm(input['windows'], desc='windows')): + + window_masks = self.model.inference( + window=window, + text_query=text_query, + metadata=input['video_metadata']) + + win_start_idx = i * ( + self.window_length - self.window_overlap) + pred_masks[win_start_idx:win_start_idx + + self.window_length] = window_masks + pred_masks_per_query.append(pred_masks) + return pred_masks_per_query + + def postprocess(self, inputs) -> Dict[str, Any]: + if self.model.cfg.pipeline.save_masked_video: + # RGB colors for instance masks: + light_blue = (41, 171, 226) + purple = (237, 30, 121) + dark_green = (35, 161, 90) + orange = (255, 148, 59) + colors = np.array([light_blue, purple, dark_green, orange]) + + # width (in pixels) of the black strip above the video on which the text queries will be displayed: + text_border_height_per_query = 36 + + video_np = rearrange(self.video, + 't c h w -> t h w c').numpy() / 255.0 + + # del video + pred_masks_per_frame = rearrange( + torch.stack(inputs), 'q t 1 h w -> t q h w').numpy() + masked_video = [] + for vid_frame, frame_masks in tqdm( + zip(video_np, pred_masks_per_frame), + total=len(video_np), + desc='applying masks...'): + # apply the masks: + for inst_mask, color in zip(frame_masks, colors): + vid_frame = apply_mask(vid_frame, inst_mask, color / 255.0) + vid_frame = Image.fromarray((vid_frame * 255).astype(np.uint8)) + # visualize the text queries: + vid_frame = ImageOps.expand( + vid_frame, + border=(0, len(self.text_queries) + * text_border_height_per_query, 0, 0)) + W, H = vid_frame.size + draw = ImageDraw.Draw(vid_frame) + font = ImageFont.truetype(font='DejaVuSansMono.ttf', size=30) + for i, (text_query, color) in enumerate( + zip(self.text_queries, colors), start=1): + w, h = draw.textsize(text_query, font=font) + draw.text(((W - w) / 2, + (text_border_height_per_query * i) - h - 3), + text_query, + fill=tuple(color) + (255, ), + font=font) + masked_video.append(np.array(vid_frame)) + print(type(vid_frame)) + print(type(masked_video[0])) + print(masked_video[0].shape) + # generate and save the output clip: + + assert self.model.cfg.pipeline.output_path + output_clip_path = self.model.cfg.pipeline.output_path + clip = ImageSequenceClip( + sequence=masked_video, fps=self.meta['video_fps']) + clip = 
clip.set_audio(AudioFileClip(self.input_clip_pth)) + clip.write_videofile( + output_clip_path, fps=self.meta['video_fps'], audio=True) + del masked_video + + result = {OutputKeys.MASKS: inputs} + return result + + +def apply_mask(image, mask, color, transparency=0.7): + mask = mask[..., np.newaxis].repeat(repeats=3, axis=2) + mask = mask * transparency + color_matrix = np.ones(image.shape, dtype=np.float) * color + out_image = color_matrix * mask + image * (1.0 - mask) + return out_image diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 0eb369da..6ba58c19 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -80,6 +80,9 @@ class CVTasks(object): virtual_try_on = 'virtual-try-on' movie_scene_segmentation = 'movie-scene-segmentation' + # video segmentation + referring_video_object_segmentation = 'referring-video-object-segmentation' + # video editing video_inpainting = 'video-inpainting' diff --git a/requirements/cv.txt b/requirements/cv.txt index eb38beb1..d23fab3a 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -1,4 +1,5 @@ albumentations>=1.0.3 +av>=9.2.0 easydict fairscale>=0.4.1 fastai>=1.0.51 @@ -14,6 +15,7 @@ lpips ml_collections mmcls>=0.21.0 mmdet>=2.25.0 +moviepy>=1.0.3 networkx>=2.5 numba onnxruntime>=1.10 diff --git a/tests/pipelines/test_referring_video_object_segmentation.py b/tests/pipelines/test_referring_video_object_segmentation.py new file mode 100644 index 00000000..3e81d9c3 --- /dev/null +++ b/tests/pipelines/test_referring_video_object_segmentation.py @@ -0,0 +1,56 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ReferringVideoObjectSegmentationTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.referring_video_object_segmentation + self.model_id = 'damo/cv_swin-t_referring_video-object-segmentation' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_referring_video_object_segmentation(self): + input_location = 'data/test/videos/referring_video_object_segmentation_test_video.mp4' + text_queries = [ + 'guy in black performing tricks on a bike', + 'a black bike used to perform tricks' + ] + start_pt, end_pt = 4, 14 + input_tuple = (input_location, text_queries, start_pt, end_pt) + pp = pipeline( + Tasks.referring_video_object_segmentation, model=self.model_id) + result = pp(input_tuple) + if result: + print(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_referring_video_object_segmentation_with_default_task(self): + input_location = 'data/test/videos/referring_video_object_segmentation_test_video.mp4' + text_queries = [ + 'guy in black performing tricks on a bike', + 'a black bike used to perform tricks' + ] + start_pt, end_pt = 4, 14 + input_tuple = (input_location, text_queries, start_pt, end_pt) + pp = pipeline(Tasks.referring_video_object_segmentation) + result = pp(input_tuple) + if result: + print(result) + else: + raise ValueError('process error') + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main()
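For reference, the pipeline's forward pass above stitches per-window predictions back into a full-length mask tensor: window i starts at i * (window_length - window_overlap), so later windows overwrite the overlapping tail of earlier ones and every frame still receives a prediction. A small sketch of that covering, where the 100-frame clip length is a hypothetical value:

    window_length, window_overlap = 24, 6
    t = 100  # hypothetical number of frames in the subclip

    starts = range(0, t, window_length - window_overlap)
    windows = [(s, min(s + window_length, t)) for s in starts]
    print(windows)
    # [(0, 24), (18, 42), (36, 60), (54, 78), (72, 96), (90, 100)]

    covered = set()
    for s, e in windows:
        covered.update(range(s, e))
    assert covered == set(range(t))  # every frame gets a prediction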