From cdebef46892b9047c9d8a1954c41d97c51f35d73 Mon Sep 17 00:00:00 2001
From: "tianchu.gtc" <tianchu.gtc@alibaba-inc.com>
Date: Wed, 24 Aug 2022 15:05:16 +0800
Subject: [PATCH 001/175] =?UTF-8?q?[to=20#42322933]panoptic=20segmentation?=
 =?UTF-8?q?=20=E6=A8=A1=E5=9E=8B=E6=8E=A5=E5=85=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

panoptic segmentation 模型接入
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9758389
---
 .../images/image_panoptic_segmentation.jpg    |   3 +
 modelscope/metainfo.py                        |   2 +
 modelscope/models/cv/__init__.py              |  11 +-
 .../image_panoptic_segmentation/__init__.py   |  22 ++++
 .../panseg_model.py                           |  54 +++++++++
 modelscope/pipelines/cv/__init__.py           |   4 +
 .../image_panoptic_segmentation_pipeline.py   | 103 ++++++++++++++++++
 modelscope/utils/cv/image_utils.py            |  19 ++++
 .../test_image_panoptic_segmentation.py       |  40 +++++++
 9 files changed, 253 insertions(+), 5 deletions(-)
 create mode 100644 data/test/images/image_panoptic_segmentation.jpg
 create mode 100644 modelscope/models/cv/image_panoptic_segmentation/__init__.py
 create mode 100644 modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
 create mode 100644 modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py
 create mode 100644 tests/pipelines/test_image_panoptic_segmentation.py

diff --git a/data/test/images/image_panoptic_segmentation.jpg b/data/test/images/image_panoptic_segmentation.jpg
new file mode 100644
index 00000000..2a8d826b
--- /dev/null
+++ b/data/test/images/image_panoptic_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a
+size 245864
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index d0684ecd..1fba50b3 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -20,6 +20,7 @@ class Models(object):
     product_retrieval_embedding = 'product-retrieval-embedding'
     body_2d_keypoints = 'body-2d-keypoints'
     crowd_counting = 'HRNetCrowdCounting'
+    panoptic_segmentation = 'swinL-panoptic-segmentation'
     image_reid_person = 'passvitb'
     video_summarization = 'pgl-video-summarization'
 
@@ -114,6 +115,7 @@ class Pipelines(object):
     tinynas_classification = 'tinynas-classification'
     crowd_counting = 'hrnet-crowd-counting'
     video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'
+    image_panoptic_segmentation = 'image-panoptic-segmentation'
     video_summarization = 'googlenet_pgl_video_summarization'
     image_reid_person = 'passvitb-image-reid-person'
 
diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py
index 168ac96c..3af7a1b6 100644
--- a/modelscope/models/cv/__init__.py
+++ b/modelscope/models/cv/__init__.py
@@ -3,8 +3,9 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints,
                cartoon, cmdssl_video_embedding, crowd_counting, face_detection,
                face_generation, image_classification, image_color_enhance,
                image_colorization, image_denoise, image_instance_segmentation,
-               image_portrait_enhancement, image_reid_person,
-               image_to_image_generation, image_to_image_translation,
-               object_detection, product_retrieval_embedding,
-               salient_detection, super_resolution,
-               video_single_object_tracking, video_summarization, virual_tryon)
+               image_panoptic_segmentation, image_portrait_enhancement,
+               image_reid_person, image_to_image_generation,
+               image_to_image_translation, object_detection,
+               product_retrieval_embedding, salient_detection,
+               super_resolution, video_single_object_tracking,
+               video_summarization, virual_tryon)
diff --git a/modelscope/models/cv/image_panoptic_segmentation/__init__.py b/modelscope/models/cv/image_panoptic_segmentation/__init__.py
new file mode 100644
index 00000000..2b2be4b7
--- /dev/null
+++ b/modelscope/models/cv/image_panoptic_segmentation/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .panseg_model import SwinLPanopticSegmentation
+
+else:
+    _import_structure = {
+        'panseg_model': ['SwinLPanopticSegmentation'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
new file mode 100644
index 00000000..f9022f90
--- /dev/null
+++ b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
@@ -0,0 +1,54 @@
+import os.path as osp
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+@MODELS.register_module(
+    Tasks.image_segmentation, module_name=Models.panoptic_segmentation)
+class SwinLPanopticSegmentation(TorchModel):
+
+    def __init__(self, model_dir: str, **kwargs):
+        """Build the mmdet detector described in ``model_dir`` and load its weights."""
+        super().__init__(model_dir, **kwargs)
+
+        from mmcv.runner import load_checkpoint
+        import mmcv
+        from mmdet.models import build_detector
+
+        cfg = mmcv.Config.fromfile(osp.join(model_dir, 'config.py'))
+        # clear any configured pretrained/init source; the weights are
+        # loaded from the checkpoint file further down
+        if 'pretrained' in cfg.model:
+            cfg.model.pretrained = None
+        elif 'init_cfg' in cfg.model.backbone:
+            cfg.model.backbone.init_cfg = None
+
+        # build model
+        cfg.model.train_cfg = None
+        self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+
+        # load model
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        checkpoint = load_checkpoint(
+            self.model, model_path, map_location='cpu')
+
+        self.CLASSES = checkpoint['meta']['CLASSES']
+        self.num_classes = len(self.CLASSES)
+        self.cfg = cfg
+
+    def inference(self, data):
+        """Run the detector on an mmdet-style dict (img, img_metas) without gradients."""
+
+        with torch.no_grad():
+            results = self.model(return_loss=False, rescale=True, **data)
+        return results
+
+    def forward(self, Inputs):
+        """Call the wrapped detector with ``Inputs`` unpacked as keyword
+        arguments and return its output unchanged."""
+        return self.model(**Inputs)
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 4ff1b856..d084a91b 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -23,6 +23,7 @@ if TYPE_CHECKING:
     from .image_denoise_pipeline import ImageDenoisePipeline
     from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline
     from .image_matting_pipeline import ImageMattingPipeline
+    from .image_panoptic_segmentation_pipeline import ImagePanopticSegmentationPipeline
     from .image_portrait_enhancement_pipeline import ImagePortraitEnhancementPipeline
     from .image_reid_person_pipeline import ImageReidPersonPipeline
     from .image_style_transfer_pipeline import ImageStyleTransferPipeline
@@ -37,6 +38,7 @@ if TYPE_CHECKING:
     from .tinynas_classification_pipeline import TinynasClassificationPipeline
     from .video_category_pipeline import VideoCategoryPipeline
     from .virtual_try_on_pipeline import VirtualTryonPipeline
+
 else:
     _import_structure = {
         'action_recognition_pipeline': ['ActionRecognitionPipeline'],
@@ -59,6 +61,8 @@ else:
         'image_instance_segmentation_pipeline':
         ['ImageInstanceSegmentationPipeline'],
         'image_matting_pipeline': ['ImageMattingPipeline'],
+        'image_panoptic_segmentation_pipeline':
+        ['ImagePanopticSegmentationPipeline'],
         'image_portrait_enhancement_pipeline':
         ['ImagePortraitEnhancementPipeline'],
         'image_reid_person_pipeline': ['ImageReidPersonPipeline'],
diff --git a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py
new file mode 100644
index 00000000..9ffc2b03
--- /dev/null
+++ b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py
@@ -0,0 +1,103 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+import cv2
+import numpy as np
+import PIL
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_segmentation,
+    module_name=Pipelines.image_panoptic_segmentation)
+class ImagePanopticSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        Use `model` to create an image panoptic segmentation pipeline for prediction.
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+
+        logger.info('panoptic segmentation model, pipeline init')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Turn a file path, PIL image or ndarray into mmdet test-time input."""
+        from copy import deepcopy
+        from mmdet.datasets.pipelines import Compose
+        from mmcv.parallel import collate, scatter
+        from mmdet.datasets import replace_ImageToTensor
+
+        # Work on a private copy: switching the loader below must not leak
+        # into self.model.cfg, or a later file-path input would be handled
+        # by 'LoadImageFromWebcam' instead of the configured loading step.
+        pipeline_cfg = deepcopy(self.model.cfg.data.test.pipeline)
+
+        if isinstance(input, str):
+            # file name: left to the loading step configured in the model cfg
+            data = dict(img_info=dict(filename=input), img_prefix=None)
+        elif isinstance(input, PIL.Image.Image):
+            pipeline_cfg[0].type = 'LoadImageFromWebcam'
+            data = dict(img=np.array(input.convert('RGB')))
+        elif isinstance(input, np.ndarray):
+            pipeline_cfg[0].type = 'LoadImageFromWebcam'
+            if len(input.shape) == 2:
+                img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
+            else:
+                img = input
+            # reverse the channel axis so that the array is in rgb order
+            data = dict(img=img[:, :, ::-1])
+        else:
+            raise TypeError(f'input should be either str, PIL.Image,'
+                            f' np.array, but got {type(input)}')
+
+        test_pipeline = Compose(replace_ImageToTensor(pipeline_cfg))
+
+        data = test_pipeline(data)
+        # collate a batch of one sample, then unwrap each `.data[0]`
+        data = collate([data], samples_per_gpu=1)
+        data['img_metas'] = [
+            img_metas.data[0] for img_metas in data['img_metas']
+        ]
+        data['img'] = [img.data[0] for img in data['img']]
+        if next(self.model.parameters()).is_cuda:
+            # scatter to specified GPU
+            data = scatter(data, [next(self.model.parameters()).device])[0]
+
+        return data
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Delegate to the model's ``inference`` and return its raw results."""
+        return self.model.inference(input)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Split the panoptic id map of the first result into per-segment masks."""
+        # bz=1, tcguo
+        pan_results = inputs[0]['pan_results']
+        INSTANCE_OFFSET = 1000
+
+        ids = np.unique(pan_results)[::-1]
+        ids = ids[ids != self.model.num_classes]  # drop the VOID label
+        # the class index is the segment id modulo INSTANCE_OFFSET
+        labels = (ids % INSTANCE_OFFSET).astype(np.int64)
+        segms = (pan_results[None] == ids[:, None, None])
+        # np.int was removed in NumPy 1.24; the builtin int is what it aliased
+        masks = [it.astype(int) for it in segms]
+        labels_txt = np.array(self.model.CLASSES)[labels].tolist()
+
+        outputs = {
+            OutputKeys.MASKS: masks,
+            OutputKeys.LABELS: labels_txt,
+            # placeholder: the same fixed score is reported for every segment
+            OutputKeys.SCORES: [0.999] * len(labels_txt)
+        }
+        return outputs
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index da8de672..fca0e54f 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -134,3 +134,22 @@ def show_video_tracking_result(video_in_path, bboxes, video_save_path):
         video_writer.write(frame)
     video_writer.release
     cap.release()
+
+
+def panoptic_seg_masks_to_image(masks):
+    """Paint every mask with its own palette color on a black canvas."""
+    from mmdet.core.visualization.image import _get_bias_color
+    from mmdet.core.visualization.palette import get_palette
+
+    draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3])
+    mask_palette = get_palette('coco', 133)
+    # Black is the canvas color, so reserve it. It has to be stored as one
+    # tuple: set([0, 0, 0]) collapses to {0} and never matches a color triple.
+    taken_colors = {(0, 0, 0)}
+    for i, mask in enumerate(masks):
+        color_mask = mask_palette[i]
+        while tuple(color_mask) in taken_colors:
+            color_mask = _get_bias_color(color_mask)
+        taken_colors.add(tuple(color_mask))
+        draw_img[mask.astype(bool)] = color_mask
+    return draw_img
diff --git a/tests/pipelines/test_image_panoptic_segmentation.py b/tests/pipelines/test_image_panoptic_segmentation.py
new file mode 100644
index 00000000..3f07adf5
--- /dev/null
+++ b/tests/pipelines/test_image_panoptic_segmentation.py
@@ -0,0 +1,40 @@
+import unittest
+
+import cv2
+import PIL
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import panoptic_seg_masks_to_image
+from modelscope.utils.test_utils import test_level
+
+
+class ImagePanopticSegmentationTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_panoptic_segmentation(self):
+        input_location = 'data/test/images/image_panoptic_segmentation.jpg'
+        model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan'
+        pan_segmentor = pipeline(Tasks.image_segmentation, model=model_id)
+        result = pan_segmentor(input_location)
+        # colorize the returned masks and save the picture as result.jpg
+        draw_img = panoptic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('print test_image_panoptic_segmentation return success')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_panoptic_segmentation_from_PIL(self):
+        input_location = 'data/test/images/image_panoptic_segmentation.jpg'
+        model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan'
+        pan_segmentor = pipeline(Tasks.image_segmentation, model=model_id)
+        PIL_array = PIL.Image.open(input_location)
+        result = pan_segmentor(PIL_array)
+        # colorize the returned masks and save the picture as result.jpg
+        draw_img = panoptic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('print test_image_panoptic_segmentation from PIL return success')
+
+
+if __name__ == '__main__':
+    unittest.main()

From c72e5f4ae8bddc4b83e9c1cd7d937340c10e987d Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Wed, 24 Aug 2022 15:08:22 +0800
Subject: [PATCH 002/175] [to #43878347] skip device placement test

Skip this test: it produces excessive device-placement debug logging, even though the debug setting is reset after the test case.

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9875987
---
 tests/utils/test_device.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/utils/test_device.py b/tests/utils/test_device.py
index 3135b214..4def9915 100644
--- a/tests/utils/test_device.py
+++ b/tests/utils/test_device.py
@@ -81,6 +81,7 @@ class DeviceTest(unittest.TestCase):
         with device_placement(Frameworks.torch, 'cpu'):
             pass
 
+    @unittest.skip('skip this test to avoid debug logging.')
     def test_device_placement_tf_gpu(self):
         tf.debugging.set_log_device_placement(True)
         with device_placement(Frameworks.tf, 'gpu:0'):

From 7da07a8370363743bf57f70a814e3e2afb59e938 Mon Sep 17 00:00:00 2001
From: "xixing.tj" <xixing.tj@alibaba-inc.com>
Date: Wed, 24 Aug 2022 15:30:38 +0800
Subject: [PATCH 003/175] =?UTF-8?q?[to=20#42322933]ocr=5Fdetection=20pipel?=
 =?UTF-8?q?ine=20jupyter=E7=8E=AF=E5=A2=83bug=20fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ocr_detection pipeline jupyter环境优化
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9876662
---
 modelscope/pipelines/cv/ocr_detection_pipeline.py | 5 +++++
 modelscope/pipelines/cv/ocr_utils/ops.py          | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py
index b54ad96d..62248714 100644
--- a/modelscope/pipelines/cv/ocr_detection_pipeline.py
+++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py
@@ -17,6 +17,11 @@ from .ocr_utils import (SegLinkDetector, cal_width, combine_segments_python,
                         decode_segments_links_python, nms_python,
                         rboxes_to_polygons)
 
+if tf.__version__ >= '2.0':
+    import tf_slim as slim
+else:
+    from tensorflow.contrib import slim
+
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
 tf.compat.v1.disable_eager_execution()
diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py
index 2bc8a8bf..eeab36a0 100644
--- a/modelscope/pipelines/cv/ocr_utils/ops.py
+++ b/modelscope/pipelines/cv/ocr_utils/ops.py
@@ -88,7 +88,7 @@ def _nn_variable(name, shape, init_method, collection=None, **kwargs):
     else:
         raise 'Unsupported weight initialization method: ' + init_method
 
-    var = tf.get_variable(name, shape=shape, initializer=initializer, **kwargs)
+    var = tf.get_variable(name, shape=shape, initializer=initializer)
     if collection is not None:
         tf.add_to_collection(collection, var)
 

From 427f0e83ea0d203deaca1b70142d6017639098d7 Mon Sep 17 00:00:00 2001
From: "menrui.mr" <menrui.mr@alibaba-inc.com>
Date: Wed, 24 Aug 2022 19:06:29 +0800
Subject: [PATCH 004/175] =?UTF-8?q?[to=20#42322933]ofa=E6=96=87=E7=94=9F?=
 =?UTF-8?q?=E5=9B=BE=E6=8E=A5=E5=85=A5clip=20reranking=E5=90=8E=E5=A4=84?=
 =?UTF-8?q?=E7=90=86=20&=20=E4=BF=AE=E5=A4=8D=E9=A2=84=E5=A4=84=E7=90=86?=
 =?UTF-8?q?=E4=B8=AD=E7=9A=84=E4=B8=80=E4=B8=AABug=20=20=20=20=20=20=20=20?=
 =?UTF-8?q?=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/coder?=
 =?UTF-8?q?eview/9880918?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../ofa_for_text_to_image_synthesis_model.py  | 158 +++++++++++++++++-
 .../ofa/text_to_image_synthesis.py            |   3 +-
 2 files changed, 155 insertions(+), 6 deletions(-)

diff --git a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
index 5cdc9668..b942e3fa 100644
--- a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
+++ b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
@@ -6,17 +6,30 @@ import numpy as np
 import torch
 import torch.cuda
 from PIL import Image
+from pkg_resources import packaging
 from taming.models.vqgan import GumbelVQ, VQModel
+from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
+                                    ToTensor)
 
 from modelscope.metainfo import Models
 from modelscope.models.base import Model
 from modelscope.models.builder import MODELS
+from modelscope.models.multi_modal.mmr.models.module_clip import CLIP
+from modelscope.models.multi_modal.mmr.models.tokenization_clip import \
+    SimpleTokenizer as ClipTokenizer
 from modelscope.models.multi_modal.ofa import OFAModel, OFATokenizer
 from modelscope.models.multi_modal.ofa.generate import sequence_generator as sg
 from modelscope.models.multi_modal.ofa.generate.search import Sampling
 from modelscope.models.multi_modal.ofa.generate.utils import move_to_device
 from modelscope.utils.constant import Tasks
 
+try:
+    from torchvision.transforms import InterpolationMode
+
+    BICUBIC = InterpolationMode.BICUBIC
+except ImportError:
+    BICUBIC = Image.BICUBIC
+
 __all__ = ['OfaForTextToImageSynthesis']
 
 
@@ -43,6 +56,74 @@ def load_vqgan(config, ckpt_path=None, is_gumbel=False):
     return model.eval()
 
 
+def build_clip_model(model_path):
+    state_dict = torch.load(model_path, map_location='cpu').state_dict()
+    vit = 'visual.proj' in state_dict
+    if vit:
+        vision_width = state_dict['visual.conv1.weight'].shape[0]
+        vision_layers = len([
+            k for k in state_dict.keys()
+            if k.startswith('visual.') and k.endswith('.attn.in_proj_weight')
+        ])
+        vision_patch_size = state_dict['visual.conv1.weight'].shape[-1]
+        grid_size = round(
+            (state_dict['visual.positional_embedding'].shape[0] - 1)**0.5)
+        image_resolution = vision_patch_size * grid_size
+    else:
+        counts: list = [
+            len(
+                set(
+                    k.split('.')[2] for k in state_dict
+                    if k.startswith(f'visual.layer{b}')))
+            for b in [1, 2, 3, 4]
+        ]
+        vision_layers = tuple(counts)
+        vision_width = state_dict['visual.layer1.0.conv1.weight'].shape[0]
+        output_width = round(
+            (state_dict['visual.attnpool.positional_embedding'].shape[0]
+             - 1)**0.5)
+        vision_patch_size = None
+        assert output_width**2 + 1 == state_dict[
+            'visual.attnpool.positional_embedding'].shape[0]
+        image_resolution = output_width * 32
+    # the remaining CLIP arguments are also derived from tensor shapes and key names
+    embed_dim = state_dict['text_projection'].shape[1]
+    context_length = state_dict['positional_embedding'].shape[0]
+    vocab_size = state_dict['token_embedding.weight'].shape[0]
+    transformer_width = state_dict['ln_final.weight'].shape[0]
+    transformer_heads = transformer_width // 64
+    transformer_layers = len(
+        set(
+            k.split('.')[2] for k in state_dict
+            if k.startswith('transformer.resblocks')))
+
+    model = CLIP(embed_dim, image_resolution, vision_layers, vision_width,
+                 vision_patch_size, context_length, vocab_size,
+                 transformer_width, transformer_heads, transformer_layers)
+    # drop these entries, when present, before load_state_dict (presumably non-parameter metadata - confirm)
+    for key in ['input_resolution', 'context_length', 'vocab_size']:
+        if key in state_dict:
+            del state_dict[key]
+
+    model.load_state_dict(state_dict)
+    return model.eval()
+
+
+def _convert_image_to_rgb(image):
+    return image.convert('RGB')
+
+
+def build_clip_transform(n_px):
+    """Return the image transform: resize, center crop, RGB, tensor, normalize."""
+    # per-channel constants handed to Normalize as its two arguments
+    mean = (0.48145466, 0.4578275, 0.40821073)
+    std = (0.26862954, 0.26130258, 0.27577711)
+    steps = [Resize(n_px, interpolation=BICUBIC), CenterCrop(n_px)]
+    steps += [_convert_image_to_rgb, ToTensor()]
+    steps.append(Normalize(mean, std))
+    return Compose(steps)
+
+
 @MODELS.register_module(Tasks.text_to_image_synthesis, module_name=Models.ofa)
 class OfaForTextToImageSynthesis(Model):
 
@@ -65,11 +146,23 @@ class OfaForTextToImageSynthesis(Model):
             vqgan_config,
             ckpt_path=os.path.join(model_dir, 'vqgan_model.ckpt'),
             is_gumbel=True).to(self._device)
+
+        # Initialize OpenAI clip
+
+        self.clip_tokenizer = ClipTokenizer(model_dir)
+        self.clip_model = build_clip_model(
+            os.path.join(model_dir, 'ViT-B-16.pt'))
+        self.clip_preprocess = build_clip_transform(
+            self.clip_model.visual.input_resolution)
+
+        self.clip_model.to(self._device)
+        self.clip_model.eval()
+
         # Initialize generator
         sampling = Sampling(self.tokenizer, sampling_topp=0.9)
         sg_args = {
             'tokenizer': self.tokenizer,
-            'beam_size': 1,
+            'beam_size': 2,
             'max_len_b': 1024,
             'min_len': 1024,
             'search_strategy': sampling,
@@ -78,13 +171,68 @@ class OfaForTextToImageSynthesis(Model):
         }
         self.generator = sg.SequenceGenerator(**sg_args)
 
+    def clip_tokenize(self, texts, context_length=77, truncate=False):
+        # Returns a zero-padded (len(texts), context_length) tensor of token ids.
+        if isinstance(texts, str):
+            texts = [texts]
+        # surround every encoded text with the start/end-of-text tokens
+        sot_token = self.clip_tokenizer.encoder['<|startoftext|>']
+        eot_token = self.clip_tokenizer.encoder['<|endoftext|>']
+        all_tokens = [[sot_token] + self.clip_tokenizer.encode(text)
+                      + [eot_token] for text in texts]
+        if packaging.version.parse(
+                torch.__version__) < packaging.version.parse('1.8.0'):
+            result = torch.zeros(
+                len(all_tokens), context_length, dtype=torch.long)
+        else:
+            result = torch.zeros(
+                len(all_tokens), context_length, dtype=torch.int)
+        # fill one row per text; over-long input is truncated or rejected
+        for i, tokens in enumerate(all_tokens):
+            if len(tokens) > context_length:
+                if truncate:
+                    tokens = tokens[:context_length]
+                    tokens[-1] = eot_token
+                else:
+                    raise RuntimeError(
+                        f'Input {texts[i]} is too long for context length {context_length}'
+                    )
+            result[i, :len(tokens)] = torch.tensor(tokens)
+
+        return result
+
     def forward(self, input: Dict[str, Any]):
+
+        text = input['samples'][0]['text']
         input = move_to_device(input, self._device)
+        clip_text_input = self.clip_tokenize([text]).to(self._device)
+
         gen_output = self.generator.generate([self.model], input)
-        gen_tokens = gen_output[0][0]['tokens'][:-1]
-        codes = gen_tokens.view(1, 32, 32) - 50265
+        gen_tokens = torch.stack(
+            [item['tokens'][:-1] for item in gen_output[0]], dim=0)
+        codes = gen_tokens.view(-1, 32, 32) - 50265
+
         quant_b = self.vqgan_model.quantize.get_codebook_entry(
             codes.view(-1),
             list(codes.size()) + [self.vqgan_model.quantize.embedding_dim])
-        dec = self.vqgan_model.decode(quant_b)[0]
-        return custom_to_pil(dec)
+        imgs = self.vqgan_model.decode(quant_b)
+
+        sample_num = imgs.size()[0]
+        pil_imgs = [custom_to_pil(imgs[i]) for i in range(sample_num)]
+
+        clip_image_input = torch.stack(
+            [self.clip_preprocess(img) for img in pil_imgs],
+            dim=0).to(self._device)
+
+        with torch.no_grad():
+            hyp_image_features = self.clip_model.encode_image(clip_image_input)
+            hyp_image_features /= hyp_image_features.norm(dim=-1, keepdim=True)
+            text_features = self.clip_model.encode_text(clip_text_input)
+            text_features /= text_features.norm(dim=-1, keepdim=True)
+        ti_similarity = hyp_image_features @ text_features.T
+
+        sorted_score, ti_indices = torch.sort(
+            ti_similarity.view(-1), descending=True)
+
+        pil_imgs_orderby_ti = [pil_imgs[index] for index in ti_indices]
+        return pil_imgs_orderby_ti[0]
diff --git a/modelscope/preprocessors/ofa/text_to_image_synthesis.py b/modelscope/preprocessors/ofa/text_to_image_synthesis.py
index 938f50de..e10de82c 100644
--- a/modelscope/preprocessors/ofa/text_to_image_synthesis.py
+++ b/modelscope/preprocessors/ofa/text_to_image_synthesis.py
@@ -19,7 +19,8 @@ class OfaTextToImageSynthesisPreprocessor(OfaBasePreprocessor):
         self.max_src_length = 64
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        source = data['text'].lower().strip().split()[:self.max_src_length]
+        source = ' '.join(
+            data['text'].lower().strip().split()[:self.max_src_length])
         source = 'what is the complete image? caption: {}'.format(source)
         inputs = self.get_inputs(source)
         sample = {

From b94bb74f665c630e129c0a6fb3e662d57207b9e3 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Wed, 24 Aug 2022 21:39:08 +0800
Subject: [PATCH 005/175] [to #42322933]Add model.save_pretrained method and
 allow finetune results used by pipeline

---
 modelscope/fileio/__init__.py                 |  2 +-
 modelscope/fileio/file.py                     |  2 +-
 modelscope/models/base/base_model.py          | 32 ++++++-
 modelscope/trainers/hooks/checkpoint_hook.py  | 17 +++-
 modelscope/utils/checkpoint.py                | 85 ++++++++++++++++++-
 modelscope/utils/config.py                    | 18 ++++
 modelscope/utils/constant.py                  |  1 +
 modelscope/utils/hub.py                       | 12 ++-
 .../hooks/logger/test_tensorboard_hook.py     |  3 +-
 tests/trainers/hooks/test_checkpoint_hook.py  | 22 ++++-
 tests/trainers/hooks/test_evaluation_hook.py  |  3 +-
 .../trainers/hooks/test_lr_scheduler_hook.py  |  3 +-
 tests/trainers/hooks/test_optimizer_hook.py   |  3 +-
 tests/trainers/hooks/test_timer_hook.py       |  5 +-
 .../test_finetune_sequence_classification.py  | 37 ++++++--
 tests/trainers/test_trainer.py                |  3 +-
 tests/trainers/test_trainer_gpu.py            |  3 +-
 tests/trainers/test_trainer_with_nlp.py       | 29 ++++++-
 tests/trainers/utils/test_inference.py        |  3 +-
 19 files changed, 254 insertions(+), 29 deletions(-)

diff --git a/modelscope/fileio/__init__.py b/modelscope/fileio/__init__.py
index 5fd10f85..b526d593 100644
--- a/modelscope/fileio/__init__.py
+++ b/modelscope/fileio/__init__.py
@@ -1,2 +1,2 @@
-from .file import File
+from .file import File, LocalStorage
 from .io import dump, dumps, load
diff --git a/modelscope/fileio/file.py b/modelscope/fileio/file.py
index 343cad9a..3fff80c8 100644
--- a/modelscope/fileio/file.py
+++ b/modelscope/fileio/file.py
@@ -240,7 +240,7 @@ class File(object):
     @staticmethod
     def _get_storage(uri):
         assert isinstance(uri,
-                          str), f'uri should be str type, buf got {type(uri)}'
+                          str), f'uri should be str type, but got {type(uri)}'
 
         if '://' not in uri:
             # local path
diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py
index 279dbba2..872c42e8 100644
--- a/modelscope/models/base/base_model.py
+++ b/modelscope/models/base/base_model.py
@@ -1,13 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+import os
 import os.path as osp
 from abc import ABC, abstractmethod
-from typing import Dict, Optional, Union
-
-import numpy as np
+from typing import Callable, Dict, List, Optional, Union
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.builder import build_model
+from modelscope.utils.checkpoint import save_pretrained
 from modelscope.utils.config import Config
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
 from modelscope.utils.device import device_placement, verify_device
@@ -119,3 +118,28 @@ class Model(ABC):
         if hasattr(cfg, 'pipeline'):
             model.pipeline = cfg.pipeline
         return model
+
+    def save_pretrained(self,
+                        target_folder: Union[str, os.PathLike],
+                        save_checkpoint_names: Union[str, List[str]] = None,
+                        save_function: Callable = None,
+                        config: Optional[dict] = None,
+                        **kwargs):
+        """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded
+
+        Args:
+            target_folder (Union[str, os.PathLike]):
+            Directory to which to save. Will be created if it doesn't exist.
+
+            save_checkpoint_names (Union[str, List[str]]):
+            The checkpoint names to be saved in the target_folder
+
+            save_function (Callable, optional):
+            The function to use to save the state dictionary.
+
+            config (Optional[dict], optional):
+            The config to write to configuration.json; it might not be identical to model.config
+
+        """
+        save_pretrained(self, target_folder, save_checkpoint_names,
+                        save_function, config, **kwargs)
diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index fc0281a1..623d4654 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -1,10 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 
+import json
+
 from modelscope import __version__
 from modelscope.metainfo import Hooks
 from modelscope.utils.checkpoint import save_checkpoint
-from modelscope.utils.constant import LogKeys
+from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.logger import get_logger
 from modelscope.utils.torch_utils import is_master
 from .builder import HOOKS
@@ -73,6 +75,18 @@ class CheckpointHook(Hook):
                 self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth')
 
         save_checkpoint(trainer.model, cur_save_name, trainer.optimizer)
+        self._save_pretrained(trainer)
+
+    def _save_pretrained(self, trainer):
+        if self.is_last_epoch(trainer) and self.by_epoch:
+            output_dir = os.path.join(self.save_dir,
+                                      ModelFile.TRAIN_OUTPUT_DIR)
+
+            trainer.model.save_pretrained(
+                output_dir,
+                ModelFile.TORCH_MODEL_BIN_FILE,
+                save_function=save_checkpoint,
+                config=trainer.cfg.to_dict())
 
     def after_train_iter(self, trainer):
         if self.by_epoch:
@@ -166,3 +180,4 @@ class BestCkptSaverHook(CheckpointHook):
             )
         save_checkpoint(trainer.model, cur_save_name, trainer.optimizer)
         self._best_ckpt_file = cur_save_name
+        self._save_pretrained(trainer)
diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py
index 76fb2a19..8b9d027a 100644
--- a/modelscope/utils/checkpoint.py
+++ b/modelscope/utils/checkpoint.py
@@ -1,15 +1,23 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import io
+import os
 import time
 from collections import OrderedDict
-from typing import Optional
+from shutil import copytree, ignore_patterns, rmtree
+from typing import Callable, List, Optional, Union
 
+import json
+import numpy as np
 import torch
 from torch.optim import Optimizer
 
 from modelscope import __version__
-from modelscope.fileio import File
+from modelscope.fileio import File, LocalStorage
+from modelscope.utils.config import JSONIteratorEncoder
+from modelscope.utils.constant import ConfigFields, ModelFile
+
+storage = LocalStorage()
 
 
 def weights_to_cpu(state_dict):
@@ -72,3 +80,76 @@ def save_checkpoint(model: torch.nn.Module,
     with io.BytesIO() as f:
         torch.save(checkpoint, f)
         File.write(f.getvalue(), filename)
+
+
+def save_pretrained(model,
+                    target_folder: Union[str, os.PathLike],
+                    save_checkpoint_name: str = None,
+                    save_function: Callable = None,
+                    config: Optional[dict] = None,
+                    **kwargs):
+    """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded
+
+    Args:
+        model (Model): Model whose params are to be saved.
+
+        target_folder (Union[str, os.PathLike]):
+        Directory to which to save. Will be created if it doesn't exist.
+
+        save_checkpoint_name (str):
+        The checkpoint name to be saved in the target_folder
+
+        save_function (Callable, optional):
+        The function to use to save the state dictionary.
+
+        config (Optional[dict], optional):
+        The config to write to configuration.json; it might not be identical to model.config
+    """
+
+    if save_function is None or not isinstance(save_function, Callable):
+        raise Exception('A valid save function must be passed in')
+
+    if target_folder is None or os.path.isfile(target_folder):
+        raise ValueError(
+            f'Provided path ({target_folder}) should be a directory, not a file'
+        )
+
+    if save_checkpoint_name is None:
+        raise Exception(
+            'At least pass in one checkpoint name for saving method')
+
+    if config is None:
+        raise ValueError('Configuration is not valid')
+
+    # Clean the folder from a previous save
+    if os.path.exists(target_folder):
+        rmtree(target_folder)
+
+    # Single ckpt path, sharded ckpt logic will be added later
+    output_ckpt_path = os.path.join(target_folder, save_checkpoint_name)
+
+    # Copy the other files of model_dir to the save directory, ignoring the original ckpt, the configuration and dotfiles
+    origin_file_to_be_ignored = [save_checkpoint_name]
+    ignore_file_set = set(origin_file_to_be_ignored)
+    ignore_file_set.add(ModelFile.CONFIGURATION)
+    ignore_file_set.add('.*')
+    if hasattr(model, 'model_dir') and model.model_dir is not None:
+        copytree(
+            model.model_dir,
+            target_folder,
+            ignore=ignore_patterns(*ignore_file_set))
+
+    # Save the ckpt to the save directory
+    try:
+        save_function(model, output_ckpt_path)
+    except Exception as e:
+        raise Exception(
+            f'During saving checkpoints, the error of "{type(e).__name__} '
+            f'with msg {e} throwed')
+
+    # Dump the config to the configuration.json
+    if ConfigFields.pipeline not in config:
+        config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]}
+    cfg_str = json.dumps(config, cls=JSONIteratorEncoder)
+    config_file = os.path.join(target_folder, ModelFile.CONFIGURATION)
+    storage.write(cfg_str.encode(), config_file)
diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py
index a28ac1ab..42985db6 100644
--- a/modelscope/utils/config.py
+++ b/modelscope/utils/config.py
@@ -12,6 +12,7 @@ from pathlib import Path
 from typing import Dict, Union
 
 import addict
+import json
 from yapf.yapflib.yapf_api import FormatCode
 
 from modelscope.utils.constant import ConfigFields, ModelFile
@@ -627,3 +628,20 @@ def check_config(cfg: Union[str, ConfigDict]):
         check_attr(ConfigFields.model)
         check_attr(ConfigFields.preprocessor)
         check_attr(ConfigFields.evaluation)
+
+
+class JSONIteratorEncoder(json.JSONEncoder):
+    """JSON encoder that supports arbitrary iterables: ``default`` returns a list
+        built from ``obj`` when it is iterable, or calls the base implementation
+        (to raise a ``TypeError``).
+
+    """
+
+    def default(self, obj):
+        try:
+            iterable = iter(obj)
+        except TypeError:
+            pass
+        else:
+            return list(iterable)
+        return json.JSONEncoder.default(self, obj)
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index d914767b..81712983 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -211,6 +211,7 @@ class ModelFile(object):
     VOCAB_FILE = 'vocab.txt'
     ONNX_MODEL_FILE = 'model.onnx'
     LABEL_MAPPING = 'label_mapping.json'
+    TRAIN_OUTPUT_DIR = 'output'
 
 
 class ConfigFields(object):
diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py
index 6d685b87..f79097fe 100644
--- a/modelscope/utils/hub.py
+++ b/modelscope/utils/hub.py
@@ -10,7 +10,8 @@ from modelscope.hub.constants import Licenses, ModelVisibility
 from modelscope.hub.file_download import model_file_download
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.utils.config import Config
-from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
+from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields,
+                                       ModelFile)
 from .logger import get_logger
 
 logger = get_logger(__name__)
@@ -119,8 +120,13 @@ def parse_label_mapping(model_dir):
     if label2id is None:
         config_path = os.path.join(model_dir, ModelFile.CONFIGURATION)
         config = Config.from_file(config_path)
-        if hasattr(config, 'model') and hasattr(config.model, 'label2id'):
-            label2id = config.model.label2id
+        if hasattr(config, ConfigFields.model) and hasattr(
+                config[ConfigFields.model], 'label2id'):
+            label2id = config[ConfigFields.model].label2id
+        elif hasattr(config, ConfigFields.preprocessor) and hasattr(
+                config[ConfigFields.preprocessor], 'label2id'):
+            label2id = config[ConfigFields.preprocessor].label2id
+
     if label2id is None:
         config_path = os.path.join(model_dir, 'config.json')
         config = Config.from_file(config_path)
diff --git a/tests/trainers/hooks/logger/test_tensorboard_hook.py b/tests/trainers/hooks/logger/test_tensorboard_hook.py
index 54c31056..67b1aa63 100644
--- a/tests/trainers/hooks/logger/test_tensorboard_hook.py
+++ b/tests/trainers/hooks/logger/test_tensorboard_hook.py
@@ -11,6 +11,7 @@ import torch
 from torch import nn
 
 from modelscope.metainfo import Trainers
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -19,7 +20,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py
index 1c81d057..c694ece6 100644
--- a/tests/trainers/hooks/test_checkpoint_hook.py
+++ b/tests/trainers/hooks/test_checkpoint_hook.py
@@ -11,11 +11,14 @@ from torch import nn
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import METRICS, MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.registry import default_group
 from modelscope.utils.test_utils import create_dummy_test_dataset
 
+SRC_DIR = os.path.dirname(__file__)
+
 
 def create_dummy_metric():
     _global_iter = 0
@@ -39,12 +42,13 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
         self.linear = nn.Linear(5, 4)
         self.bn = nn.BatchNorm1d(4)
+        self.model_dir = SRC_DIR
 
     def forward(self, feat, labels):
         x = self.linear(feat)
@@ -123,6 +127,14 @@ class CheckpointHookTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
 
+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(SRC_DIR)
+        self.assertIn(copy_src_files[0], output_files)
+        self.assertIn(copy_src_files[-1], output_files)
+
 
 class BestCkptSaverHookTest(unittest.TestCase):
 
@@ -198,6 +210,14 @@ class BestCkptSaverHookTest(unittest.TestCase):
         self.assertIn(f'best_{LogKeys.EPOCH}1_{MetricKeys.ACCURACY}0.1.pth',
                       results_files)
 
+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(SRC_DIR)
+        self.assertIn(copy_src_files[0], output_files)
+        self.assertIn(copy_src_files[-1], output_files)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/trainers/hooks/test_evaluation_hook.py b/tests/trainers/hooks/test_evaluation_hook.py
index 1338bb2c..2c71e790 100644
--- a/tests/trainers/hooks/test_evaluation_hook.py
+++ b/tests/trainers/hooks/test_evaluation_hook.py
@@ -11,6 +11,7 @@ from torch import nn
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import METRICS, MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile
 from modelscope.utils.registry import default_group
@@ -34,7 +35,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py
index 86d53ecc..7a1ff220 100644
--- a/tests/trainers/hooks/test_lr_scheduler_hook.py
+++ b/tests/trainers/hooks/test_lr_scheduler_hook.py
@@ -13,6 +13,7 @@ from torch.optim.lr_scheduler import MultiStepLR
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import METRICS, MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
 from modelscope.utils.registry import default_group
@@ -40,7 +41,7 @@ def create_dummy_metric():
             return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]}
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_optimizer_hook.py b/tests/trainers/hooks/test_optimizer_hook.py
index 25457c1c..84c783b5 100644
--- a/tests/trainers/hooks/test_optimizer_hook.py
+++ b/tests/trainers/hooks/test_optimizer_hook.py
@@ -12,6 +12,7 @@ from torch.optim import SGD
 from torch.optim.lr_scheduler import MultiStepLR
 
 from modelscope.metainfo import Trainers
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile, TrainerStages
 from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -20,7 +21,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_timer_hook.py b/tests/trainers/hooks/test_timer_hook.py
index 614f7688..9fb79c77 100644
--- a/tests/trainers/hooks/test_timer_hook.py
+++ b/tests/trainers/hooks/test_timer_hook.py
@@ -12,6 +12,7 @@ from torch.optim import SGD
 from torch.optim.lr_scheduler import MultiStepLR
 
 from modelscope.metainfo import Trainers
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
 from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -20,7 +21,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
@@ -83,8 +84,8 @@ class IterTimerHookTest(unittest.TestCase):
             trainer.train_dataset, **trainer.cfg.train.get('dataloader', {}))
         trainer.register_optimizers_hook()
         trainer.register_hook_from_cfg(trainer.cfg.train.hooks)
-        trainer.data_loader = train_dataloader
         trainer.train_dataloader = train_dataloader
+        trainer.data_loader = train_dataloader
         trainer.invoke_hook(TrainerStages.before_run)
         for i in range(trainer._epoch, trainer._max_epochs):
             trainer.invoke_hook(TrainerStages.before_train_epoch)
diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py
index 12c7da77..847e47ef 100644
--- a/tests/trainers/test_finetune_sequence_classification.py
+++ b/tests/trainers/test_finetune_sequence_classification.py
@@ -4,11 +4,18 @@ import shutil
 import tempfile
 import unittest
 
-from modelscope.metainfo import Trainers
+from modelscope.metainfo import Preprocessors, Trainers
+from modelscope.models import Model
+from modelscope.pipelines import pipeline
 from modelscope.trainers import build_trainer
+from modelscope.utils.constant import ModelFile, Tasks
 
 
 class TestFinetuneSequenceClassification(unittest.TestCase):
+    epoch_num = 1
+
+    sentence1 = '今天气温比昨天高么？'
+    sentence2 = '今天湿度比昨天高么？'
 
     def setUp(self):
         print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
@@ -40,15 +47,32 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(10):
+        for i in range(self.epoch_num):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(trainer.model_dir)
+
+        print(f'copy_src_files are {copy_src_files}')
+        print(f'output_files are {output_files}')
+        for item in copy_src_files:
+            if not item.startswith('.'):
+                self.assertIn(item, output_files)
+
+    def pipeline_sentence_similarity(self, model_dir):
+        model = Model.from_pretrained(model_dir)
+        pipeline_ins = pipeline(task=Tasks.sentence_similarity, model=model)
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
     @unittest.skip
     def test_finetune_afqmc(self):
 
         def cfg_modify_fn(cfg):
-            cfg.task = 'sentence-similarity'
-            cfg['preprocessor'] = {'type': 'sen-sim-tokenizer'}
+            cfg.task = Tasks.sentence_similarity
+            cfg['preprocessor'] = {'type': Preprocessors.sen_sim_tokenizer}
             cfg.train.optimizer.lr = 2e-5
             cfg['dataset'] = {
                 'train': {
@@ -58,7 +82,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
                     'label': 'label',
                 }
             }
-            cfg.train.max_epochs = 10
+            cfg.train.max_epochs = self.epoch_num
             cfg.train.lr_scheduler = {
                 'type': 'LinearLR',
                 'start_factor': 1.0,
@@ -95,6 +119,9 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             eval_dataset=dataset['validation'],
             cfg_modify_fn=cfg_modify_fn)
 
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        self.pipeline_sentence_similarity(output_dir)
+
     @unittest.skip
     def test_finetune_tnews(self):
 
diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py
index 0259f804..be29844d 100644
--- a/tests/trainers/test_trainer.py
+++ b/tests/trainers/test_trainer.py
@@ -14,6 +14,7 @@ from torch.utils.data import IterableDataset
 
 from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
 from modelscope.utils.test_utils import create_dummy_test_dataset, test_level
@@ -35,7 +36,7 @@ dummy_dataset_big = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py
index 9781816d..3777772d 100644
--- a/tests/trainers/test_trainer_gpu.py
+++ b/tests/trainers/test_trainer_gpu.py
@@ -15,6 +15,7 @@ from torch.utils.data import IterableDataset
 
 from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import EpochBasedTrainer, build_trainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
 from modelscope.utils.test_utils import (DistributedTestCase,
@@ -37,7 +38,7 @@ dummy_dataset_big = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index 213b6b4f..2cf1c152 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -6,16 +6,20 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Metrics
+from modelscope.models.base import Model
 from modelscope.models.nlp.sequence_classification import \
     SbertForSequenceClassification
 from modelscope.msdatasets import MsDataset
+from modelscope.pipelines import pipeline
 from modelscope.trainers import build_trainer
-from modelscope.utils.constant import ModelFile
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.hub import read_config
 from modelscope.utils.test_utils import test_level
 
 
 class TestTrainerWithNlp(unittest.TestCase):
+    sentence1 = '今天气温比昨天高么？'
+    sentence2 = '今天湿度比昨天高么？'
 
     def setUp(self):
         print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
@@ -30,7 +34,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer(self):
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
         kwargs = dict(
@@ -47,6 +51,27 @@ class TestTrainerWithNlp(unittest.TestCase):
         for i in range(10):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(trainer.model_dir)
+
+        print(f'copy_src_files are {copy_src_files}')
+        print(f'output_files are {output_files}')
+        for item in copy_src_files:
+            if not item.startswith('.'):
+                self.assertIn(item, output_files)
+
+        def pipeline_sentence_similarity(model_dir):
+            model = Model.from_pretrained(model_dir)
+            pipeline_ins = pipeline(
+                task=Tasks.sentence_similarity, model=model)
+            print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        pipeline_sentence_similarity(output_dir)
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_trainer_with_backbone_head(self):
         model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
diff --git a/tests/trainers/utils/test_inference.py b/tests/trainers/utils/test_inference.py
index 87e5320e..23561734 100644
--- a/tests/trainers/utils/test_inference.py
+++ b/tests/trainers/utils/test_inference.py
@@ -11,6 +11,7 @@ from torch.utils.data import DataLoader
 from modelscope.metrics.builder import MetricKeys
 from modelscope.metrics.sequence_classification_metric import \
     SequenceClassificationMetric
+from modelscope.models.base import Model
 from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test
 from modelscope.utils.test_utils import (DistributedTestCase,
                                          create_dummy_test_dataset, test_level)
@@ -20,7 +21,7 @@ dummy_dataset = create_dummy_test_dataset(
     torch.rand((5, )), torch.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()

From 0b6725add658f933a5c6ad9e47c9f6a54bd55435 Mon Sep 17 00:00:00 2001
From: "tianchu.gtc" <tianchu.gtc@alibaba-inc.com>
Date: Thu, 25 Aug 2022 15:50:24 +0800
Subject: [PATCH 006/175] =?UTF-8?q?[to=20#42322933]semantic=20segmentation?=
 =?UTF-8?q?=20=E6=A8=A1=E5=9E=8B=E6=8E=A5=E5=85=A5=20=20=20=20=20=20=20=20?=
 =?UTF-8?q?=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/coder?=
 =?UTF-8?q?eview/9851374?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../images/image_semantic_segmentation.jpg    |   3 +
 modelscope/metainfo.py                        |   3 +
 modelscope/models/cv/__init__.py              |  10 +-
 .../image_semantic_segmentation/__init__.py   |  22 +
 .../pan_merge/__init__.py                     |   1 +
 .../pan_merge/base_panoptic_fusion_head.py    |  47 ++
 .../pan_merge/maskformer_semantic_head.py     |  57 ++
 .../semantic_seg_model.py                     |  76 +++
 .../vit_adapter/__init__.py                   |   3 +
 .../vit_adapter/models/__init__.py            |   3 +
 .../vit_adapter/models/backbone/__init__.py   |   4 +
 .../models/backbone/adapter_modules.py        | 523 ++++++++++++++++
 .../models/backbone/base/__init__.py          |   3 +
 .../vit_adapter/models/backbone/base/beit.py  | 476 ++++++++++++++
 .../models/backbone/beit_adapter.py           | 169 +++++
 .../models/decode_heads/__init__.py           |   3 +
 .../models/decode_heads/base_decode_head.py   | 267 ++++++++
 .../mask2former_head_from_mmseg.py            | 581 ++++++++++++++++++
 .../vit_adapter/models/segmentors/__init__.py |   3 +
 .../models/segmentors/base_segmentor.py       | 314 ++++++++++
 .../segmentors/encoder_decoder_mask2former.py | 303 +++++++++
 .../vit_adapter/utils/__init__.py             |   7 +
 .../vit_adapter/utils/builder.py              |  11 +
 .../vit_adapter/utils/data_process_func.py    |  60 ++
 .../vit_adapter/utils/seg_func.py             |  48 ++
 modelscope/pipelines/cv/__init__.py           |   3 +
 .../image_semantic_segmentation_pipeline.py   |  95 +++
 modelscope/utils/cv/image_utils.py            |  13 +
 .../test_image_semantic_segmentation.py       |  54 ++
 29 files changed, 3157 insertions(+), 5 deletions(-)
 create mode 100644 data/test/images/image_semantic_segmentation.jpg
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/__init__.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
 create mode 100644 modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py
 create mode 100644 tests/pipelines/test_image_semantic_segmentation.py

diff --git a/data/test/images/image_semantic_segmentation.jpg b/data/test/images/image_semantic_segmentation.jpg
new file mode 100644
index 00000000..2a8d826b
--- /dev/null
+++ b/data/test/images/image_semantic_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a
+size 245864
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 1fba50b3..8e21c00b 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -23,6 +23,8 @@ class Models(object):
     panoptic_segmentation = 'swinL-panoptic-segmentation'
     image_reid_person = 'passvitb'
     video_summarization = 'pgl-video-summarization'
+    swinL_semantic_segmentation = 'swinL-semantic-segmentation'
+    vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
 
     # nlp models
     bert = 'bert'
@@ -117,6 +119,7 @@ class Pipelines(object):
     video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'
     image_panoptic_segmentation = 'image-panoptic-segmentation'
     video_summarization = 'googlenet_pgl_video_summarization'
+    image_semantic_segmentation = 'image-semantic-segmentation'
     image_reid_person = 'passvitb-image-reid-person'
 
     # nlp tasks
diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py
index 3af7a1b6..227be2c7 100644
--- a/modelscope/models/cv/__init__.py
+++ b/modelscope/models/cv/__init__.py
@@ -4,8 +4,8 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints,
                face_generation, image_classification, image_color_enhance,
                image_colorization, image_denoise, image_instance_segmentation,
                image_panoptic_segmentation, image_portrait_enhancement,
-               image_reid_person, image_to_image_generation,
-               image_to_image_translation, object_detection,
-               product_retrieval_embedding, salient_detection,
-               super_resolution, video_single_object_tracking,
-               video_summarization, virual_tryon)
+               image_reid_person, image_semantic_segmentation,
+               image_to_image_generation, image_to_image_translation,
+               object_detection, product_retrieval_embedding,
+               salient_detection, super_resolution,
+               video_single_object_tracking, video_summarization, virual_tryon)
diff --git a/modelscope/models/cv/image_semantic_segmentation/__init__.py b/modelscope/models/cv/image_semantic_segmentation/__init__.py
new file mode 100644
index 00000000..598d7c21
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .semantic_seg_model import SemanticSegmentation
+
+else:
+    _import_structure = {
+        'semantic_seg_model': ['SemanticSegmentation'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
new file mode 100644
index 00000000..2a75f318
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
@@ -0,0 +1 @@
+from .maskformer_semantic_head import MaskFormerSemanticHead
diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py
new file mode 100644
index 00000000..05e68d89
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+from mmcv.runner import BaseModule
+from mmdet.models.builder import build_loss
+
+
+class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta):
+    """Base class for panoptic heads."""
+
+    def __init__(self,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 test_cfg=None,
+                 loss_panoptic=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(BasePanopticFusionHead, self).__init__(init_cfg)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = num_things_classes + num_stuff_classes
+        self.test_cfg = test_cfg
+
+        if loss_panoptic:
+            self.loss_panoptic = build_loss(loss_panoptic)
+        else:
+            self.loss_panoptic = None
+
+    @property
+    def with_loss(self):
+        """bool: whether the panoptic head contains loss function."""
+        return self.loss_panoptic is not None
+
+    @abstractmethod
+    def forward_train(self, gt_masks=None, gt_semantic_seg=None, **kwargs):
+        """Forward function during training."""
+
+    @abstractmethod
+    def simple_test(self,
+                    img_metas,
+                    det_labels,
+                    mask_preds,
+                    seg_preds,
+                    det_bboxes,
+                    cfg=None,
+                    **kwargs):
+        """Test without augmentation."""
diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
new file mode 100644
index 00000000..6769ebaf
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
@@ -0,0 +1,57 @@
+import torch
+import torch.nn.functional as F
+from mmdet.models.builder import HEADS
+
+from .base_panoptic_fusion_head import BasePanopticFusionHead
+
+
+@HEADS.register_module()
+class MaskFormerSemanticHead(BasePanopticFusionHead):
+
+    def __init__(self,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 test_cfg=None,
+                 loss_panoptic=None,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(num_things_classes, num_stuff_classes, test_cfg,
+                         loss_panoptic, init_cfg, **kwargs)
+
+    def forward_train(self, **kwargs):
+        """MaskFormerSemanticHead has no training loss."""
+        return dict()
+
+    def simple_test(self,
+                    mask_cls_results,
+                    mask_pred_results,
+                    img_metas,
+                    rescale=False,
+                    **kwargs):
+        results = []
+        for mask_cls_result, mask_pred_result, meta in zip(
+                mask_cls_results, mask_pred_results, img_metas):
+            # remove padding
+            img_height, img_width = meta['img_shape'][:2]
+            mask_pred_result = mask_pred_result[:, :img_height, :img_width]
+
+            if rescale:
+                # return result in original resolution
+                ori_height, ori_width = meta['ori_shape'][:2]
+                mask_pred_result = F.interpolate(
+                    mask_pred_result[:, None],
+                    size=(ori_height, ori_width),
+                    mode='bilinear',
+                    align_corners=False)[:, 0]
+
+            # semantic inference
+            cls_score = F.softmax(mask_cls_result, dim=-1)[..., :-1]
+            mask_pred = mask_pred_result.sigmoid()
+            seg_mask = torch.einsum('qc,qhw->chw', cls_score, mask_pred)
+            # still need softmax and argmax
+            seg_logit = F.softmax(seg_mask, dim=0)
+            seg_pred = seg_logit.argmax(dim=0)
+            seg_pred = seg_pred.cpu().numpy()
+            results.append(seg_pred)
+
+        return results
diff --git a/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
new file mode 100644
index 00000000..60acf28f
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
@@ -0,0 +1,76 @@
+import os.path as osp
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.image_semantic_segmentation import (pan_merge,
+                                                              vit_adapter)
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+@MODELS.register_module(
+    Tasks.image_segmentation, module_name=Models.swinL_semantic_segmentation)
+@MODELS.register_module(
+    Tasks.image_segmentation,
+    module_name=Models.vitadapter_semantic_segmentation)
+class SemanticSegmentation(TorchModel):
+
+    def __init__(self, model_dir: str, **kwargs):
+        """Build the model from ``model_dir`` (str, model file root)."""
+        super().__init__(model_dir, **kwargs)
+
+        from mmcv.runner import load_checkpoint
+        import mmcv
+        from mmdet.models import build_detector
+        # load the mmcv config, clearing pretrained/init_cfg entries
+        config = osp.join(model_dir, 'mmcv_config.py')
+        cfg = mmcv.Config.fromfile(config)
+        if 'pretrained' in cfg.model:
+            cfg.model.pretrained = None
+        elif 'init_cfg' in cfg.model.backbone:
+            cfg.model.backbone.init_cfg = None
+
+        # build model
+        cfg.model.train_cfg = None
+        self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+
+        # load model
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        _ = load_checkpoint(self.model, model_path, map_location='cpu')
+
+        self.CLASSES = cfg['CLASSES']  # list
+        self.PALETTE = cfg['PALETTE']  # list
+
+        self.num_classes = len(self.CLASSES)
+        self.cfg = cfg
+
+    def forward(self, Inputs):
+        return self.model(**Inputs)
+
+    def postprocess(self, Inputs):
+        semantic_result = Inputs[0]
+        # class ids present in the prediction, sorted in descending order
+        ids = np.unique(semantic_result)[::-1]
+        legal_indices = ids != self.model.num_classes  # for VOID label
+        ids = ids[legal_indices]
+        # one mask per remaining id; np.int alias was removed in NumPy 1.24
+        segms = (semantic_result[None] == ids[:, None, None])
+        masks = [it.astype(int) for it in segms]
+        labels_txt = np.array(self.CLASSES)[ids].tolist()
+        # NOTE: every score is the fixed constant 0.999, not a model output
+        results = {
+            OutputKeys.MASKS: masks,
+            OutputKeys.LABELS: labels_txt,
+            OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))]
+        }
+        return results
+
+    def inference(self, data):
+        with torch.no_grad():
+            results = self.model(return_loss=False, rescale=True, **data)
+
+        return results
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
new file mode 100644
index 00000000..82eec1c6
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
@@ -0,0 +1,3 @@
+from .models import backbone, decode_heads, segmentors
+from .utils import (ResizeToMultiple, add_prefix, build_pixel_sampler,
+                    seg_resize)
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
new file mode 100644
index 00000000..ae5c5acf
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
@@ -0,0 +1,3 @@
+from .backbone import BASEBEiT, BEiTAdapter
+from .decode_heads import Mask2FormerHeadFromMMSeg
+from .segmentors import EncoderDecoderMask2Former
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
new file mode 100644
index 00000000..ab4258c1
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
@@ -0,0 +1,4 @@
+from .base import BASEBEiT
+from .beit_adapter import BEiTAdapter
+
+__all__ = ['BEiTAdapter', 'BASEBEiT']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
new file mode 100644
index 00000000..03080342
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
@@ -0,0 +1,523 @@
+# The implementation refers to the VitAdapter
+# available at
+# https://github.com/czczup/ViT-Adapter.git
+
+import logging
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmdet.models.utils.transformer import MultiScaleDeformableAttention
+from timm.models.layers import DropPath
+
+_logger = logging.getLogger(__name__)
+
+
+def get_reference_points(spatial_shapes, device):
+    reference_points_list = []
+    for lvl, (H_, W_) in enumerate(spatial_shapes):
+        ref_y, ref_x = torch.meshgrid(
+            torch.linspace(
+                0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
+            torch.linspace(
+                0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
+        ref_y = ref_y.reshape(-1)[None] / H_
+        ref_x = ref_x.reshape(-1)[None] / W_
+        ref = torch.stack((ref_x, ref_y), -1)
+        reference_points_list.append(ref)
+    reference_points = torch.cat(reference_points_list, 1)
+    reference_points = reference_points[:, :, None]
+    return reference_points
+
+
+def deform_inputs(x):
+    bs, c, h, w = x.shape
+    spatial_shapes = torch.as_tensor([(h // 8, w // 8), (h // 16, w // 16),
+                                      (h // 32, w // 32)],
+                                     dtype=torch.long,
+                                     device=x.device)
+    level_start_index = torch.cat((spatial_shapes.new_zeros(
+        (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+    reference_points = get_reference_points([(h // 16, w // 16)], x.device)
+    deform_inputs1 = [reference_points, spatial_shapes, level_start_index]
+
+    spatial_shapes = torch.as_tensor([(h // 16, w // 16)],
+                                     dtype=torch.long,
+                                     device=x.device)
+    level_start_index = torch.cat((spatial_shapes.new_zeros(
+        (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+    reference_points = get_reference_points([(h // 8, w // 8),
+                                             (h // 16, w // 16),
+                                             (h // 32, w // 32)], x.device)
+    deform_inputs2 = [reference_points, spatial_shapes, level_start_index]
+
+    return deform_inputs1, deform_inputs2
+
+
+class ConvFFN(nn.Module):
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.dwconv = DWConv(hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x, H, W):
+        x = self.fc1(x)
+        x = self.dwconv(x, H, W)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class DWConv(nn.Module):
+    """Depth-wise 3x3 conv shared by three packed token maps."""
+    def __init__(self, dim=768):
+        super().__init__()
+        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
+
+    def forward(self, x, H, W):
+        B, N, C = x.shape
+        n = N // 21  # x packs 16n + 4n + n tokens for the three maps
+        x1 = x[:, 0:16 * n, :].transpose(1, 2).view(B, C, H * 2,
+                                                    W * 2).contiguous()
+        x2 = x[:, 16 * n:20 * n, :].transpose(1, 2).view(B, C, H,
+                                                         W).contiguous()
+        x3 = x[:, 20 * n:, :].transpose(1, 2).view(B, C, H // 2,
+                                                   W // 2).contiguous()
+        x1 = self.dwconv(x1).flatten(2).transpose(1, 2)
+        x2 = self.dwconv(x2).flatten(2).transpose(1, 2)
+        x3 = self.dwconv(x3).flatten(2).transpose(1, 2)
+        x = torch.cat([x1, x2, x3], dim=1)
+        return x
+
+
+class Extractor(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads=6,
+                 n_points=4,
+                 n_levels=1,
+                 deform_ratio=1.0,
+                 with_cffn=True,
+                 cffn_ratio=0.25,
+                 drop=0.,
+                 drop_path=0.,
+                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                 with_cp=False):
+        super().__init__()
+        self.query_norm = norm_layer(dim)
+        self.feat_norm = norm_layer(dim)
+        self.attn = MultiScaleDeformableAttention(
+            embed_dims=dim,
+            num_heads=num_heads,
+            num_levels=n_levels,
+            num_points=n_points,
+            batch_first=True)
+
+        # modify to fit the deform_ratio
+        value_proj_in_features = self.attn.value_proj.weight.shape[0]
+        value_proj_out_features = int(value_proj_in_features * deform_ratio)
+        self.attn.value_proj = nn.Linear(value_proj_in_features,
+                                         value_proj_out_features)
+        self.attn.output_proj = nn.Linear(value_proj_out_features,
+                                          value_proj_in_features)
+
+        self.with_cffn = with_cffn
+        self.with_cp = with_cp
+        if with_cffn:
+            self.ffn = ConvFFN(
+                in_features=dim,
+                hidden_features=int(dim * cffn_ratio),
+                drop=drop)
+            self.ffn_norm = norm_layer(dim)
+            self.drop_path = DropPath(
+                drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, query, reference_points, feat, spatial_shapes,
+                level_start_index, H, W):
+
+        def _inner_forward(query, feat):
+            attn = self.attn(
+                query=self.query_norm(query),
+                key=None,
+                value=self.feat_norm(feat),
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=reference_points,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index)
+
+            query = query + attn
+
+            if self.with_cffn:
+                query = query + self.drop_path(
+                    self.ffn(self.ffn_norm(query), H, W))
+            return query
+
+        if self.with_cp and query.requires_grad:
+            query = cp.checkpoint(_inner_forward, query, feat)
+        else:
+            query = _inner_forward(query, feat)
+
+        return query
+
+
+class Injector(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads=6,
+                 n_points=4,
+                 n_levels=1,
+                 deform_ratio=1.0,
+                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                 init_values=0.,
+                 with_cp=False):
+        super().__init__()
+        self.with_cp = with_cp
+        self.query_norm = norm_layer(dim)
+        self.feat_norm = norm_layer(dim)
+        self.attn = MultiScaleDeformableAttention(
+            embed_dims=dim,
+            num_heads=num_heads,
+            num_levels=n_levels,
+            num_points=n_points,
+            batch_first=True)
+
+        # modify to fit the deform_ratio
+        value_proj_in_features = self.attn.value_proj.weight.shape[0]
+        value_proj_out_features = int(value_proj_in_features * deform_ratio)
+        self.attn.value_proj = nn.Linear(value_proj_in_features,
+                                         value_proj_out_features)
+        self.attn.output_proj = nn.Linear(value_proj_out_features,
+                                          value_proj_in_features)
+
+        self.gamma = nn.Parameter(
+            init_values * torch.ones((dim)), requires_grad=True)
+
+    def forward(self, query, reference_points, feat, spatial_shapes,
+                level_start_index):
+
+        def _inner_forward(query, feat):
+            input_query = self.query_norm(query)
+            input_value = self.feat_norm(feat)
+            attn = self.attn(
+                query=input_query,
+                key=None,
+                value=input_value,
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=reference_points,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index)
+            return query + self.gamma * attn
+
+        if self.with_cp and query.requires_grad:
+            query = cp.checkpoint(_inner_forward, query, feat)
+        else:
+            query = _inner_forward(query, feat)
+
+        return query
+
+
+class InteractionBlock(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads=6,
+                 n_points=4,
+                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                 drop=0.,
+                 drop_path=0.,
+                 with_cffn=True,
+                 cffn_ratio=0.25,
+                 init_values=0.,
+                 deform_ratio=1.0,
+                 extra_extractor=False,
+                 with_cp=False):
+        super().__init__()
+
+        self.injector = Injector(
+            dim=dim,
+            n_levels=3,
+            num_heads=num_heads,
+            init_values=init_values,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cp=with_cp)
+        self.extractor = Extractor(
+            dim=dim,
+            n_levels=1,
+            num_heads=num_heads,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cffn=with_cffn,
+            cffn_ratio=cffn_ratio,
+            drop=drop,
+            drop_path=drop_path,
+            with_cp=with_cp)
+        if extra_extractor:
+            self.extra_extractors = nn.Sequential(*[
+                Extractor(
+                    dim=dim,
+                    num_heads=num_heads,
+                    n_points=n_points,
+                    norm_layer=norm_layer,
+                    with_cffn=with_cffn,
+                    cffn_ratio=cffn_ratio,
+                    deform_ratio=deform_ratio,
+                    drop=drop,
+                    drop_path=drop_path,
+                    with_cp=with_cp) for _ in range(2)
+            ])
+        else:
+            self.extra_extractors = None
+
+    def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W):
+        x = self.injector(
+            query=x,
+            reference_points=deform_inputs1[0],
+            feat=c,
+            spatial_shapes=deform_inputs1[1],
+            level_start_index=deform_inputs1[2])
+        for idx, blk in enumerate(blocks):
+            x = blk(x, H, W)
+        c = self.extractor(
+            query=c,
+            reference_points=deform_inputs2[0],
+            feat=x,
+            spatial_shapes=deform_inputs2[1],
+            level_start_index=deform_inputs2[2],
+            H=H,
+            W=W)
+        if self.extra_extractors is not None:
+            for extractor in self.extra_extractors:
+                c = extractor(
+                    query=c,
+                    reference_points=deform_inputs2[0],
+                    feat=x,
+                    spatial_shapes=deform_inputs2[1],
+                    level_start_index=deform_inputs2[2],
+                    H=H,
+                    W=W)
+        return x, c
+
+
+class InteractionBlockWithCls(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads=6,
+                 n_points=4,
+                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                 drop=0.,
+                 drop_path=0.,
+                 with_cffn=True,
+                 cffn_ratio=0.25,
+                 init_values=0.,
+                 deform_ratio=1.0,
+                 extra_extractor=False,
+                 with_cp=False):
+        super().__init__()
+
+        self.injector = Injector(
+            dim=dim,
+            n_levels=3,
+            num_heads=num_heads,
+            init_values=init_values,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cp=with_cp)
+        self.extractor = Extractor(
+            dim=dim,
+            n_levels=1,
+            num_heads=num_heads,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cffn=with_cffn,
+            cffn_ratio=cffn_ratio,
+            drop=drop,
+            drop_path=drop_path,
+            with_cp=with_cp)
+        if extra_extractor:
+            self.extra_extractors = nn.Sequential(*[
+                Extractor(
+                    dim=dim,
+                    num_heads=num_heads,
+                    n_points=n_points,
+                    norm_layer=norm_layer,
+                    with_cffn=with_cffn,
+                    cffn_ratio=cffn_ratio,
+                    deform_ratio=deform_ratio,
+                    drop=drop,
+                    drop_path=drop_path,
+                    with_cp=with_cp) for _ in range(2)
+            ])
+        else:
+            self.extra_extractors = None
+
+    def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W):
+        x = self.injector(
+            query=x,
+            reference_points=deform_inputs1[0],
+            feat=c,
+            spatial_shapes=deform_inputs1[1],
+            level_start_index=deform_inputs1[2])
+        x = torch.cat((cls, x), dim=1)
+        for idx, blk in enumerate(blocks):
+            x = blk(x, H, W)
+        cls, x = x[:, :1, ], x[:, 1:, ]
+        c = self.extractor(
+            query=c,
+            reference_points=deform_inputs2[0],
+            feat=x,
+            spatial_shapes=deform_inputs2[1],
+            level_start_index=deform_inputs2[2],
+            H=H,
+            W=W)
+        if self.extra_extractors is not None:
+            for extractor in self.extra_extractors:
+                c = extractor(
+                    query=c,
+                    reference_points=deform_inputs2[0],
+                    feat=x,
+                    spatial_shapes=deform_inputs2[1],
+                    level_start_index=deform_inputs2[2],
+                    H=H,
+                    W=W)
+        return x, c, cls
+
+
+class SpatialPriorModule(nn.Module):
+
+    def __init__(self, inplanes=64, embed_dim=384, with_cp=False):
+        super().__init__()
+        self.with_cp = with_cp
+
+        self.stem = nn.Sequential(*[
+            nn.Conv2d(
+                3, inplanes, kernel_size=3, stride=2, padding=1, bias=False),
+            nn.SyncBatchNorm(inplanes),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                inplanes,
+                inplanes,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=False),
+            nn.SyncBatchNorm(inplanes),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                inplanes,
+                inplanes,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=False),
+            nn.SyncBatchNorm(inplanes),
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        ])
+        self.conv2 = nn.Sequential(*[
+            nn.Conv2d(
+                inplanes,
+                2 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias=False),
+            nn.SyncBatchNorm(2 * inplanes),
+            nn.ReLU(inplace=True)
+        ])
+        self.conv3 = nn.Sequential(*[
+            nn.Conv2d(
+                2 * inplanes,
+                4 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias=False),
+            nn.SyncBatchNorm(4 * inplanes),
+            nn.ReLU(inplace=True)
+        ])
+        self.conv4 = nn.Sequential(*[
+            nn.Conv2d(
+                4 * inplanes,
+                4 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias=False),
+            nn.SyncBatchNorm(4 * inplanes),
+            nn.ReLU(inplace=True)
+        ])
+        self.fc1 = nn.Conv2d(
+            inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True)
+        self.fc2 = nn.Conv2d(
+            2 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+        self.fc3 = nn.Conv2d(
+            4 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+        self.fc4 = nn.Conv2d(
+            4 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            c1 = self.stem(x)
+            c2 = self.conv2(c1)
+            c3 = self.conv3(c2)
+            c4 = self.conv4(c3)
+            c1 = self.fc1(c1)
+            c2 = self.fc2(c2)
+            c3 = self.fc3(c3)
+            c4 = self.fc4(c4)
+
+            bs, dim, _, _ = c1.shape
+
+            c2 = c2.view(bs, dim, -1).transpose(1, 2)  # 8s
+            c3 = c3.view(bs, dim, -1).transpose(1, 2)  # 16s
+            c4 = c4.view(bs, dim, -1).transpose(1, 2)  # 32s
+
+            return c1, c2, c3, c4
+
+        if self.with_cp and x.requires_grad:
+            outs = cp.checkpoint(_inner_forward, x)
+        else:
+            outs = _inner_forward(x)
+        return outs
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
new file mode 100644
index 00000000..40b0fa89
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
@@ -0,0 +1,3 @@
+from .beit import BASEBEiT
+
+__all__ = ['BASEBEiT']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
new file mode 100644
index 00000000..a5811fb9
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
@@ -0,0 +1,476 @@
+# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)
+# Github source: https://github.com/microsoft/unilm/tree/master/beit
+# This implementation refers to
+# https://github.com/czczup/ViT-Adapter.git
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.runner import _load_checkpoint
+from mmdet.models.builder import BACKBONES
+from mmdet.utils import get_root_logger
+from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+
+
+class DropPath(nn.Module):
+    """Module wrapper around timm's ``drop_path`` (stochastic depth, applied
+    per sample on the main path of residual blocks)."""
+
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob  # forwarded unchanged to drop_path()
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return 'p={}'.format(self.drop_prob)
+
+
+class Mlp(nn.Module):
+    """Two-layer feed-forward block: fc1 -> activation -> fc2 -> dropout."""
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        # no dropout after the activation, as in the original BERT implementation
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention with optional learned relative position bias."""
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 window_size=None,
+                 attn_head_dim=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
+        self.scale = qk_scale or head_dim**-0.5
+        # no bias inside qkv itself: forward() builds it from q_bias, zeros for k, and v_bias
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = None
+            self.v_bias = None
+
+        if window_size:
+            self.window_size = window_size
+            self.num_relative_distance = (2 * window_size[0]
+                                          - 1) * (2 * window_size[1] - 1) + 3
+            self.relative_position_bias_table = nn.Parameter(
+                torch.zeros(self.num_relative_distance,
+                            num_heads))  # 2*Wh-1 * 2*Ww-1 + 3, nH
+            # the 3 extra rows: cls to token & token to cls & cls to cls
+
+            # get pair-wise relative position index for each token inside the window
+            coords_h = torch.arange(window_size[0])
+            coords_w = torch.arange(window_size[1])
+            coords = torch.stack(torch.meshgrid([coords_h,
+                                                 coords_w]))  # 2, Wh, Ww
+            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+            relative_coords = coords_flatten[:, :,
+                                             None] - coords_flatten[:,
+                                                                    None, :]  # 2, Wh*Ww, Wh*Ww
+            relative_coords = relative_coords.permute(
+                1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+            relative_coords[:, :,
+                            0] += window_size[0] - 1  # shift to start from 0
+            relative_coords[:, :, 1] += window_size[1] - 1
+            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+            relative_position_index = \
+                torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+            relative_position_index[1:, 1:] = relative_coords.sum(
+                -1)  # Wh*Ww, Wh*Ww
+            relative_position_index[0, 0:] = self.num_relative_distance - 3
+            relative_position_index[0:, 0] = self.num_relative_distance - 2
+            relative_position_index[0, 0] = self.num_relative_distance - 1
+            self.register_buffer('relative_position_index',
+                                 relative_position_index)
+
+        else:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, rel_pos_bias=None):
+        B, N, C = x.shape
+        qkv_bias = None
+        if self.q_bias is not None:
+            qkv_bias = torch.cat(
+                (self.q_bias,
+                 torch.zeros_like(self.v_bias,
+                                  requires_grad=False), self.v_bias))
+
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[
+            2]  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        if self.relative_position_bias_table is not None:
+            relative_position_bias = \
+                self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                    self.window_size[0] * self.window_size[1] + 1,
+                    self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww+1,Wh*Ww+1,nH
+            relative_position_bias = relative_position_bias.permute(
+                2, 0, 1).contiguous()  # nH, Wh*Ww+1, Wh*Ww+1
+
+            attn = attn + relative_position_bias.unsqueeze(0)
+
+        if rel_pos_bias is not None:
+            attn = attn + rel_pos_bias
+
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):
+    """Pre-norm transformer block: attention and MLP residual branches."""
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 init_values=None,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 window_size=None,
+                 attn_head_dim=None,
+                 with_cp=False):
+        super().__init__()
+        self.with_cp = with_cp
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            window_size=window_size,
+            attn_head_dim=attn_head_dim)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+
+        if init_values is not None:  # learnable per-channel branch scales
+            self.gamma_1 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True)
+            self.gamma_2 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True)
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+
+    def forward(self, x, H, W, rel_pos_bias=None):
+        """Apply the block to x; H and W are accepted but not used here."""
+        def _inner_forward(x):
+            if self.gamma_1 is None:
+                x = x + self.drop_path(
+                    self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
+                x = x + self.drop_path(self.mlp(self.norm2(x)))
+            else:
+                x = x + self.drop_path(self.gamma_1 * self.attn(
+                    self.norm1(x), rel_pos_bias=rel_pos_bias))
+                x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+            return x
+        # go through cp.checkpoint only when enabled and x requires grad
+        if self.with_cp and x.requires_grad:
+            x = cp.checkpoint(_inner_forward, x)
+        else:
+            x = _inner_forward(x)
+        return x
+
+
+class PatchEmbed(nn.Module):
+    """Image to patch embedding: a strided conv whose output is flattened to
+    a token sequence; forward also returns the patch grid size (Hp, Wp)."""
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        num_patches = (img_size[1] // patch_size[1]) * (
+            img_size[0] // patch_size[0])
+        self.patch_shape = (img_size[0] // patch_size[0],
+                            img_size[1] // patch_size[1])
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, x, **kwargs):
+        B, C, H, W = x.shape
+        # FIXME look at relaxing size constraints
+        # assert H == self.img_size[0] and W == self.img_size[1], \
+        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        x = self.proj(x)
+        Hp, Wp = x.shape[2], x.shape[3]
+        # (B, embed_dim, Hp, Wp) -> (B, Hp * Wp, embed_dim)
+        x = x.flatten(2).transpose(1, 2)
+        return x, Hp, Wp
+
+
+class HybridEmbed(nn.Module):
+    """CNN feature map embedding: take the backbone's last feature map,
+    flatten it and project to embed_dim. NOTE(review): unlike PatchEmbed,
+    forward returns only the tokens and no patch_shape attribute is set."""
+
+    def __init__(self,
+                 backbone,
+                 img_size=224,
+                 feature_size=None,
+                 in_chans=3,
+                 embed_dim=768):
+        super().__init__()
+        assert isinstance(backbone, nn.Module)
+        img_size = to_2tuple(img_size)
+        self.img_size = img_size
+        self.backbone = backbone
+        if feature_size is None:
+            with torch.no_grad():
+                # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
+                # map for all networks, the feature metadata has reliable channel and stride info, but using
+                # stride to calc feature dim requires info about padding of each stage that isn't captured.
+                training = backbone.training
+                if training:
+                    backbone.eval()
+                o = self.backbone(
+                    torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
+                feature_size = o.shape[-2:]
+                feature_dim = o.shape[1]
+                backbone.train(training)  # restore the caller's train/eval mode
+        else:
+            feature_size = to_2tuple(feature_size)
+            feature_dim = self.backbone.feature_info.channels()[-1]
+        self.num_patches = feature_size[0] * feature_size[1]
+        self.proj = nn.Linear(feature_dim, embed_dim)
+
+    def forward(self, x):
+        x = self.backbone(x)[-1]
+        x = x.flatten(2).transpose(1, 2)
+        x = self.proj(x)
+        return x
+
+
+class RelativePositionBias(nn.Module):
+    """Learned relative position bias, returned as (nH, Wh*Ww+1, Wh*Ww+1)."""
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        self.num_relative_distance = (2 * window_size[0]
+                                      - 1) * (2 * window_size[1] - 1) + 3
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros(self.num_relative_distance,
+                        num_heads))  # 2*Wh-1 * 2*Ww-1 + 3, nH
+        # the 3 extra rows: cls to token & token to cls & cls to cls
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(window_size[0])
+        coords_w = torch.arange(window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :,
+                                         None] - coords_flatten[:,
+                                                                None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(
+            1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        relative_position_index = \
+            torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+        relative_position_index[1:,
+                                1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+
+        self.register_buffer('relative_position_index',
+                             relative_position_index)
+
+    def forward(self):
+        relative_position_bias = \
+            self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                self.window_size[0] * self.window_size[1] + 1,
+                self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww+1,Wh*Ww+1,nH
+        return relative_position_bias.permute(
+            2, 0, 1).contiguous()  # nH, Wh*Ww+1, Wh*Ww+1
+
+
+@BACKBONES.register_module()
+class BASEBEiT(nn.Module):
+    """BEiT-style ViT backbone base: builds the patch embedding, cls token and
+    transformer blocks and loads weights; it defines no forward() itself."""
+
+    def __init__(self,
+                 img_size=512,
+                 patch_size=16,
+                 in_chans=3,
+                 num_classes=80,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 hybrid_backbone=None,
+                 norm_layer=None,
+                 init_values=None,
+                 use_checkpoint=False,
+                 use_abs_pos_emb=False,
+                 use_rel_pos_bias=True,
+                 use_shared_rel_pos_bias=False,
+                 pretrained=None,
+                 with_cp=False):
+        super().__init__()
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        self.norm_layer = norm_layer
+        self.num_classes = num_classes
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.drop_path_rate = drop_path_rate
+        if hybrid_backbone is not None:
+            self.patch_embed = HybridEmbed(
+                hybrid_backbone,
+                img_size=img_size,
+                in_chans=in_chans,
+                embed_dim=embed_dim)
+        else:
+            self.patch_embed = PatchEmbed(
+                img_size=img_size,
+                patch_size=patch_size,
+                in_chans=in_chans,
+                embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        if use_abs_pos_emb:
+            self.pos_embed = nn.Parameter(
+                torch.zeros(1, num_patches + 1, embed_dim))
+        else:
+            self.pos_embed = None
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        if use_shared_rel_pos_bias:
+            self.rel_pos_bias = RelativePositionBias(
+                window_size=self.patch_embed.patch_shape, num_heads=num_heads)
+        else:
+            self.rel_pos_bias = None
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
+               ]  # stochastic depth decay rule
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.use_checkpoint = use_checkpoint  # stored only; blocks use with_cp
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                with_cp=with_cp,
+                init_values=init_values,
+                window_size=self.patch_embed.patch_shape
+                if use_rel_pos_bias else None) for i in range(depth)
+        ])
+
+        trunc_normal_(self.cls_token, std=.02)
+        self.apply(self._init_weights)
+        self.init_weights(pretrained)
+
+    def init_weights(self, pretrained=None):
+        """Load pre-trained weights, if a checkpoint path is given.
+
+        Args:
+            pretrained (str, optional): Path to pre-trained weights, loaded
+                non-strictly. Defaults to None, which loads nothing.
+        """
+        if isinstance(pretrained, str):
+            ckpt = _load_checkpoint(
+                pretrained, logger=get_root_logger(), map_location='cpu')
+            # this class defines no resize_rel_pos_embed; use one only if a subclass adds it
+            resize = getattr(self, 'resize_rel_pos_embed', None)
+            state_dict = resize(ckpt) if callable(resize) else ckpt.get(
+                'state_dict', ckpt.get('model', ckpt))
+            self.load_state_dict(state_dict, False)
+
+    def fix_init_weight(self):
+        """Divide each block's attn.proj and mlp.fc2 weights by sqrt(2 * layer)."""
+        def rescale(param, layer_id):
+            param.div_(math.sqrt(2.0 * layer_id))
+
+        for layer_id, layer in enumerate(self.blocks):
+            rescale(layer.attn.proj.weight.data, layer_id + 1)
+            rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def get_num_layers(self):
+        return len(self.blocks)
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
new file mode 100644
index 00000000..02a4968e
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
@@ -0,0 +1,169 @@
+# The implementation refers to the VitAdapter
+# available at
+# https://github.com/czczup/ViT-Adapter.git
+import logging
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmdet.models.builder import BACKBONES
+from mmdet.models.utils.transformer import MultiScaleDeformableAttention
+from timm.models.layers import DropPath, trunc_normal_
+from torch.nn.init import normal_
+
+from .adapter_modules import InteractionBlockWithCls as InteractionBlock
+from .adapter_modules import SpatialPriorModule, deform_inputs
+from .base.beit import BASEBEiT
+
+_logger = logging.getLogger(__name__)
+
+
+@BACKBONES.register_module()
+class BEiTAdapter(BASEBEiT):
+    """BEiT backbone with ViT-Adapter modules; forward returns 4 feature maps."""
+    def __init__(self,
+                 pretrain_size=224,
+                 conv_inplane=64,
+                 n_points=4,
+                 deform_num_heads=6,
+                 init_values=0.,
+                 cffn_ratio=0.25,
+                 deform_ratio=1.0,
+                 with_cffn=True,
+                 interaction_indexes=None,
+                 add_vit_feature=True,
+                 with_cp=False,
+                 *args,
+                 **kwargs):
+        # interaction_indexes is required in practice: len() is taken of it below
+        super().__init__(
+            init_values=init_values, with_cp=with_cp, *args, **kwargs)
+
+        self.num_block = len(self.blocks)
+        self.pretrain_size = (pretrain_size, pretrain_size)
+        self.flags = [
+            i for i in range(-1, self.num_block, self.num_block // 4)
+        ][1:]  # NOTE(review): flags is not read anywhere in this class
+        self.interaction_indexes = interaction_indexes
+        self.add_vit_feature = add_vit_feature
+        embed_dim = self.embed_dim
+
+        self.level_embed = nn.Parameter(torch.zeros(3, embed_dim))
+        self.spm = SpatialPriorModule(
+            inplanes=conv_inplane, embed_dim=embed_dim, with_cp=False)
+        self.interactions = nn.Sequential(*[
+            InteractionBlock(
+                dim=embed_dim,
+                num_heads=deform_num_heads,
+                n_points=n_points,
+                init_values=init_values,
+                drop_path=self.drop_path_rate,
+                norm_layer=self.norm_layer,
+                with_cffn=with_cffn,
+                cffn_ratio=cffn_ratio,
+                deform_ratio=deform_ratio,
+                extra_extractor=True if i == len(interaction_indexes)
+                - 1 else False,
+                with_cp=with_cp) for i in range(len(interaction_indexes))
+        ])
+
+        self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2)
+        self.norm1 = nn.SyncBatchNorm(embed_dim)
+        self.norm2 = nn.SyncBatchNorm(embed_dim)
+        self.norm3 = nn.SyncBatchNorm(embed_dim)
+        self.norm4 = nn.SyncBatchNorm(embed_dim)
+
+        self.up.apply(self._init_weights)
+        self.spm.apply(self._init_weights)
+        self.interactions.apply(self._init_weights)
+        self.apply(self._init_deform_weights)
+        normal_(self.level_embed)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm) or isinstance(m, nn.BatchNorm2d):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+
+    def _get_pos_embed(self, pos_embed, H, W):
+        pos_embed = pos_embed.reshape(1, self.pretrain_size[0] // 16,
+                                      self.pretrain_size[1] // 16,
+                                      -1).permute(0, 3, 1, 2)
+        pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
+            reshape(1, -1, H * W).permute(0, 2, 1)
+        return pos_embed
+
+    def _init_deform_weights(self, m):
+        if isinstance(m, MultiScaleDeformableAttention):
+            m.init_weights()
+
+    def _add_level_embed(self, c2, c3, c4):
+        c2 = c2 + self.level_embed[0]
+        c3 = c3 + self.level_embed[1]
+        c4 = c4 + self.level_embed[2]
+        return c2, c3, c4
+
+    def forward(self, x):
+        deform_inputs1, deform_inputs2 = deform_inputs(x)
+
+        # SPM forward
+        c1, c2, c3, c4 = self.spm(x)
+        c2, c3, c4 = self._add_level_embed(c2, c3, c4)
+        c = torch.cat([c2, c3, c4], dim=1)
+
+        # Patch Embedding forward
+        x, H, W = self.patch_embed(x)
+        bs, n, dim = x.shape
+        cls = self.cls_token.expand(
+            bs, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        # NOTE(review): confirm pos_embed's length fits the reshape in _get_pos_embed
+        if self.pos_embed is not None:
+            pos_embed = self._get_pos_embed(self.pos_embed, H, W)
+            x = x + pos_embed
+        x = self.pos_drop(x)
+
+        # Interaction
+        outs = list()
+        for i, layer in enumerate(self.interactions):
+            indexes = self.interaction_indexes[i]
+            x, c, cls = layer(x, c, cls,
+                              self.blocks[indexes[0]:indexes[-1] + 1],
+                              deform_inputs1, deform_inputs2, H, W)
+            outs.append(x.transpose(1, 2).view(bs, dim, H, W).contiguous())
+
+        # Split & Reshape
+        c2 = c[:, 0:c2.size(1), :]
+        c3 = c[:, c2.size(1):c2.size(1) + c3.size(1), :]
+        c4 = c[:, c2.size(1) + c3.size(1):, :]
+
+        c2 = c2.transpose(1, 2).view(bs, dim, H * 2, W * 2).contiguous()
+        c3 = c3.transpose(1, 2).view(bs, dim, H, W).contiguous()
+        c4 = c4.transpose(1, 2).view(bs, dim, H // 2, W // 2).contiguous()
+        c1 = self.up(c2) + c1
+
+        if self.add_vit_feature:
+            x1, x2, x3, x4 = outs
+            x1 = F.interpolate(
+                x1, scale_factor=4, mode='bilinear', align_corners=False)
+            x2 = F.interpolate(
+                x2, scale_factor=2, mode='bilinear', align_corners=False)
+            x4 = F.interpolate(
+                x4, scale_factor=0.5, mode='bilinear', align_corners=False)
+            c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4
+
+        # Final Norm
+        f1 = self.norm1(c1)
+        f2 = self.norm2(c2)
+        f3 = self.norm3(c3)
+        f4 = self.norm4(c4)
+        return [f1, f2, f3, f4]
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
new file mode 100644
index 00000000..9367806f
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
@@ -0,0 +1,3 @@
+from .mask2former_head_from_mmseg import Mask2FormerHeadFromMMSeg
+
+__all__ = ['Mask2FormerHeadFromMMSeg']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
new file mode 100644
index 00000000..36660520
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
@@ -0,0 +1,267 @@
+# The implementation refers to the VitAdapter
+# available at
+# https://github.com/czczup/ViT-Adapter.git
+from abc import ABCMeta, abstractmethod
+
+import torch
+import torch.nn as nn
+from mmcv.runner import BaseModule, auto_fp16, force_fp32
+from mmdet.models.builder import build_loss
+from mmdet.models.losses import accuracy
+
+from ...utils import build_pixel_sampler, seg_resize
+
+
+class BaseDecodeHead(BaseModule, metaclass=ABCMeta):
+    """Base class for segmentation decode heads.
+
+    Args:
+        in_channels (int|Sequence[int]): Input channels.
+        channels (int): Channels after modules, before conv_seg.
+        num_classes (int): Number of classes.
+        dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
+        conv_cfg (dict|None): Config of conv layers. Default: None.
+        norm_cfg (dict|None): Config of norm layers. Default: None.
+        act_cfg (dict): Config of activation layers.
+            Default: dict(type='ReLU')
+        in_index (int|Sequence[int]): Input feature index. Default: -1
+        input_transform (str|None): Transformation type of input features.
+            Options: 'resize_concat', 'multiple_select', None.
+            'resize_concat': Multiple feature maps will be resized to the
+                same size as the first one and then concatenated.
+                Usually used in FCN head of HRNet.
+            'multiple_select': Multiple feature maps will be bundled into
+                a list and passed into decode head.
+            None: Only one selected feature map is allowed.
+            Default: None.
+        loss_decode (dict | Sequence[dict]): Config of decode loss.
+            The `loss_name` is property of corresponding loss function which
+            could be shown in training log. If you want this loss
+            item to be included into the backward graph, `loss_` must be the
+            prefix of the name. Defaults to 'loss_ce'.
+             e.g. dict(type='CrossEntropyLoss'),
+             [dict(type='CrossEntropyLoss', loss_name='loss_ce'),
+              dict(type='DiceLoss', loss_name='loss_dice')]
+            Default: dict(type='CrossEntropyLoss').
+        ignore_index (int | None): The label index to be ignored. When using
+            masked BCE loss, ignore_index should be set to None. Default: 255.
+        sampler (dict|None): The config of segmentation map sampler.
+            Default: None.
+        align_corners (bool): align_corners argument of F.interpolate.
+            Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 channels,
+                 *,
+                 num_classes,
+                 dropout_ratio=0.1,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=dict(type='ReLU'),
+                 in_index=-1,
+                 input_transform=None,
+                 loss_decode=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 ignore_index=255,
+                 sampler=None,
+                 align_corners=False,
+                 init_cfg=dict(
+                     type='Normal', std=0.01, override=dict(name='conv_seg'))):
+        super(BaseDecodeHead, self).__init__(init_cfg)
+        self._init_inputs(in_channels, in_index, input_transform)
+        self.channels = channels
+        self.num_classes = num_classes
+        self.dropout_ratio = dropout_ratio
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.in_index = in_index
+
+        self.ignore_index = ignore_index
+        self.align_corners = align_corners
+
+        if isinstance(loss_decode, dict):
+            self.loss_decode = build_loss(loss_decode)
+        elif isinstance(loss_decode, (list, tuple)):
+            self.loss_decode = nn.ModuleList()
+            for loss in loss_decode:
+                self.loss_decode.append(build_loss(loss))
+        else:
+            raise TypeError('loss_decode must be a dict or sequence of dict, '
+                            f'but got {type(loss_decode)}')
+
+        if sampler is not None:
+            self.sampler = build_pixel_sampler(sampler, context=self)
+        else:
+            self.sampler = None
+
+        self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
+        if dropout_ratio > 0:
+            self.dropout = nn.Dropout2d(dropout_ratio)
+        else:
+            self.dropout = None
+        self.fp16_enabled = False
+
+    def extra_repr(self):
+        """Return the extra fields appended to this module's repr string."""
+        s = f'input_transform={self.input_transform}, ' \
+            f'ignore_index={self.ignore_index}, ' \
+            f'align_corners={self.align_corners}'
+        return s
+
+    def _init_inputs(self, in_channels, in_index, input_transform):
+        """Check and initialize input transforms.
+
+        The in_channels, in_index and input_transform must match: when
+        input_transform is None a single feature map is selected, so both
+        in_channels and in_index must be int; otherwise both must be a list
+        or tuple of equal length.
+
+        Args:
+            in_channels (int|Sequence[int]): Input channels.
+            in_index (int|Sequence[int]): Input feature index.
+            input_transform (str|None): Transformation type of input features.
+                Options: 'resize_concat', 'multiple_select', None.
+                'resize_concat': Multiple feature maps will be resized to the
+                    same size as the first one and then concatenated.
+                    Usually used in FCN head of HRNet.
+                'multiple_select': Multiple feature maps will be bundled into
+                    a list and passed into decode head.
+                None: Only one selected feature map is allowed.
+        """
+
+        if input_transform is not None:
+            assert input_transform in ['resize_concat', 'multiple_select']
+        self.input_transform = input_transform
+        self.in_index = in_index
+        if input_transform is not None:
+            assert isinstance(in_channels, (list, tuple))
+            assert isinstance(in_index, (list, tuple))
+            assert len(in_channels) == len(in_index)
+            if input_transform == 'resize_concat':
+                self.in_channels = sum(in_channels)
+            else:
+                self.in_channels = in_channels
+        else:
+            assert isinstance(in_channels, int)
+            assert isinstance(in_index, int)
+            self.in_channels = in_channels
+
+    def _transform_inputs(self, inputs):
+        """Select (and optionally resize and concatenate) decoder inputs.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+
+        Returns:
+            Tensor | list[Tensor]: The transformed inputs.
+        """
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                seg_resize(
+                    input=x,
+                    size=inputs[0].shape[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = torch.cat(upsampled_inputs, dim=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index]
+
+        return inputs
+
+    @auto_fp16()
+    @abstractmethod
+    def forward(self, inputs):
+        """Placeholder of forward function; subclasses must implement it."""
+        pass
+
+    def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
+        """Forward function for training.
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            gt_semantic_seg (Tensor): Semantic segmentation masks
+                used if the architecture supports semantic segmentation task.
+            train_cfg (dict): The training config.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        seg_logits = self.forward(inputs)
+        losses = self.losses(seg_logits, gt_semantic_seg)
+        return losses
+
+    def forward_test(self, inputs, img_metas, test_cfg):
+        """Forward function for testing.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            test_cfg (dict): The testing config.
+
+        Returns:
+            Tensor: Output segmentation map.
+        """
+        return self.forward(inputs)
+
+    def cls_seg(self, feat):
+        """Classify each pixel, applying dropout first when it is enabled."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.conv_seg(feat)
+        return output
+
+    @force_fp32(apply_to=('seg_logit', ))
+    def losses(self, seg_logit, seg_label):
+        """Compute the segmentation losses and the `acc_seg` accuracy."""
+        loss = dict()
+        seg_logit = seg_resize(
+            input=seg_logit,
+            size=seg_label.shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        if self.sampler is not None:
+            seg_weight = self.sampler.sample(seg_logit, seg_label)
+        else:
+            seg_weight = None
+        seg_label = seg_label.squeeze(1)
+
+        if not isinstance(self.loss_decode, nn.ModuleList):
+            losses_decode = [self.loss_decode]
+        else:
+            losses_decode = self.loss_decode
+        for loss_decode in losses_decode:
+            if loss_decode.loss_name not in loss:
+                loss[loss_decode.loss_name] = loss_decode(
+                    seg_logit,
+                    seg_label,
+                    weight=seg_weight,
+                    ignore_index=self.ignore_index)
+            else:
+                loss[loss_decode.loss_name] += loss_decode(
+                    seg_logit,
+                    seg_label,
+                    weight=seg_weight,
+                    ignore_index=self.ignore_index)
+
+        loss['acc_seg'] = accuracy(
+            seg_logit, seg_label, ignore_index=self.ignore_index)
+        return loss
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
new file mode 100644
index 00000000..ad8b1586
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
@@ -0,0 +1,581 @@
+# The implementation refers to the VitAdapter
+# available at
+# https://github.com/czczup/ViT-Adapter.git
+
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init
+from mmcv.cnn.bricks.transformer import (build_positional_encoding,
+                                         build_transformer_layer_sequence)
+from mmcv.ops import point_sample
+from mmcv.runner import ModuleList, force_fp32
+from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
+from mmdet.models.builder import HEADS, build_loss
+from mmdet.models.utils import get_uncertain_point_coords_with_randomness
+
+from .base_decode_head import BaseDecodeHead
+
+
+@HEADS.register_module()
+class Mask2FormerHeadFromMMSeg(BaseDecodeHead):
+    """Implements the Mask2Former head.
+
+    See `Masked-attention Mask Transformer for Universal Image
+    Segmentation <https://arxiv.org/pdf/2112.01527>`_ for details.
+
+    Args:
+        in_channels (list[int]): Number of channels in the input feature map.
+        feat_channels (int): Number of channels for features.
+        out_channels (int): Number of channels for output.
+        num_things_classes (int): Number of things.
+        num_stuff_classes (int): Number of stuff.
+        num_queries (int): Number of query in Transformer decoder.
+        pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel
+            decoder. Defaults to None.
+        enforce_decoder_input_project (bool, optional): Whether to add
+            a layer to change the embed_dim of transformer encoder in
+            pixel decoder to the embed_dim of transformer decoder.
+            Defaults to False.
+        transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer decoder. Defaults to None.
+        positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer decoder position encoding. Defaults to None.
+        loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification
+            loss. Defaults to None.
+        loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss.
+            Defaults to None.
+        loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss.
+            Defaults to None.
+        train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of
+            Mask2Former head.
+        test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of
+            Mask2Former head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 feat_channels,
+                 out_channels,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 num_queries=100,
+                 num_transformer_feat_level=3,
+                 pixel_decoder=None,
+                 enforce_decoder_input_project=False,
+                 transformer_decoder=None,
+                 positional_encoding=None,
+                 loss_cls=None,
+                 loss_mask=None,
+                 loss_dice=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(Mask2FormerHeadFromMMSeg, self).__init__(
+            in_channels=in_channels,
+            channels=feat_channels,
+            num_classes=(num_things_classes + num_stuff_classes),
+            init_cfg=init_cfg,
+            input_transform='multiple_select',
+            **kwargs)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = self.num_things_classes + self.num_stuff_classes
+        self.num_queries = num_queries
+        self.num_transformer_feat_level = num_transformer_feat_level
+        self.num_heads = transformer_decoder.transformerlayers. \
+            attn_cfgs.num_heads
+        self.num_transformer_decoder_layers = transformer_decoder.num_layers
+        assert pixel_decoder.encoder.transformerlayers.attn_cfgs.num_levels == num_transformer_feat_level
+        pixel_decoder_ = copy.deepcopy(pixel_decoder)
+        pixel_decoder_.update(
+            in_channels=in_channels,
+            feat_channels=feat_channels,
+            out_channels=out_channels)
+        self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1]
+        self.transformer_decoder = build_transformer_layer_sequence(
+            transformer_decoder)
+        self.decoder_embed_dims = self.transformer_decoder.embed_dims
+
+        self.decoder_input_projs = ModuleList()
+        # one projection per feature level, from low to high resolution
+        for _ in range(num_transformer_feat_level):
+            if (self.decoder_embed_dims != feat_channels
+                    or enforce_decoder_input_project):
+                self.decoder_input_projs.append(
+                    Conv2d(
+                        feat_channels, self.decoder_embed_dims, kernel_size=1))
+            else:
+                self.decoder_input_projs.append(nn.Identity())
+        self.decoder_positional_encoding = build_positional_encoding(
+            positional_encoding)
+        self.query_embed = nn.Embedding(self.num_queries, feat_channels)
+        self.query_feat = nn.Embedding(self.num_queries, feat_channels)
+        # one learned embedding per level, from low to high resolution
+        self.level_embed = nn.Embedding(self.num_transformer_feat_level,
+                                        feat_channels)
+        # the extra class index (num_classes) labels unmatched queries
+        self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
+        self.mask_embed = nn.Sequential(
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, out_channels))
+        self.conv_seg = None  # drop the unused classifier from BaseDecodeHead
+
+        self.test_cfg = test_cfg
+        self.train_cfg = train_cfg
+        if train_cfg:  # training-only helpers, absent when train_cfg is empty
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            self.sampler = build_sampler(self.train_cfg.sampler, context=self)
+            self.num_points = self.train_cfg.get('num_points', 12544)
+            self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0)
+            self.importance_sample_ratio = self.train_cfg.get(
+                'importance_sample_ratio', 0.75)
+
+        self.class_weight = loss_cls.class_weight
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_mask = build_loss(loss_mask)
+        self.loss_dice = build_loss(loss_dice)
+
+    def init_weights(self):
+        """Init input projections, pixel decoder and transformer decoder."""
+        for m in self.decoder_input_projs:
+            if isinstance(m, Conv2d):
+                caffe2_xavier_init(m, bias=0)
+        self.pixel_decoder.init_weights()
+
+        for p in self.transformer_decoder.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_normal_(p)
+
+    def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list,
+                    gt_masks_list, img_metas):
+        """Build classification and mask targets of one decoder layer for every
+        image in the batch.
+
+        Args:
+            cls_scores_list (list[Tensor]): Mask score logits from a single
+                decoder layer for all images. Each with shape [num_queries,
+                cls_out_channels].
+            mask_preds_list (list[Tensor]): Mask logits from a single decoder
+                layer for all images. Each with shape [num_queries, h, w].
+            gt_labels_list (list[Tensor]): Ground truth class indices for all
+                images. Each with shape (n, ), n is the sum of number of stuff
+                types and number of instances in an image.
+            gt_masks_list (list[Tensor]): Ground truth mask for each image,
+                each with shape (n, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[list[Tensor]]: a tuple containing the following targets.
+
+                - labels_list (list[Tensor]): Labels of all images.
+                    Each with shape [num_queries, ].
+                - label_weights_list (list[Tensor]): Label weights of all
+                    images. Each with shape [num_queries, ].
+                - mask_targets_list (list[Tensor]): Mask targets of all images.
+                    Each with shape [num_pos, h, w], one per positive query.
+                - mask_weights_list (list[Tensor]): Mask weights of all images.
+                    Each with shape [num_queries, ].
+                - num_total_pos (int): Number of positive samples in all
+                    images.
+                - num_total_neg (int): Number of negative samples in all
+                    images.
+        """
+        targets = multi_apply(self._get_target_single, cls_scores_list,
+                              mask_preds_list, gt_labels_list, gt_masks_list,
+                              img_metas)
+        (labels_list, label_weights_list, mask_targets_list,
+         mask_weights_list, pos_inds_list, neg_inds_list) = targets
+
+        num_total_pos = sum(inds.numel() for inds in pos_inds_list)
+        num_total_neg = sum(inds.numel() for inds in neg_inds_list)
+        return (labels_list, label_weights_list, mask_targets_list,
+                mask_weights_list, num_total_pos, num_total_neg)
+
+    def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks,
+                           img_metas):
+        """Compute classification and mask targets for one image.
+
+        Args:
+            cls_score (Tensor): Mask score logits from a single decoder layer
+                for one image. Shape (num_queries, cls_out_channels).
+            mask_pred (Tensor): Mask logits for a single decoder layer for one
+                image. Shape (num_queries, h, w).
+            gt_labels (Tensor): Ground truth class indices for one image with
+                shape (num_gts, ).
+            gt_masks (Tensor): Ground truth mask for each image, each with
+                shape (num_gts, h, w).
+            img_metas (dict): Image information.
+
+        Returns:
+            tuple[Tensor]: A tuple containing the following for one image.
+
+                - labels (Tensor): Labels of each image. \
+                    shape (num_queries, ).
+                - label_weights (Tensor): Label weights of each image. \
+                    shape (num_queries, ).
+                - mask_targets (Tensor): Mask targets of each image. \
+                    shape (num_pos, h, w), one per positive query.
+                - mask_weights (Tensor): Mask weights of each image. \
+                    shape (num_queries, ).
+                - pos_inds (Tensor): Sampled positive indices for each \
+                    image.
+                - neg_inds (Tensor): Sampled negative indices for each \
+                    image.
+        """
+        # sample the same random points from predicted and GT masks
+        num_queries = cls_score.shape[0]
+        num_gts = gt_labels.shape[0]
+
+        point_coords = torch.rand((1, self.num_points, 2),
+                                  device=cls_score.device)
+        # shape (num_queries, num_points)
+        mask_points_pred = point_sample(
+            mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1,
+                                                        1)).squeeze(1)
+        # shape (num_gts, num_points)
+        gt_points_masks = point_sample(
+            gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1,
+                                                               1)).squeeze(1)
+
+        # assign and sample
+        assign_result = self.assigner.assign(cls_score, mask_points_pred,
+                                             gt_labels, gt_points_masks,
+                                             img_metas)
+        sampling_result = self.sampler.sample(assign_result, mask_pred,
+                                              gt_masks)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label target: unmatched queries keep the extra index num_classes
+        labels = gt_labels.new_full((self.num_queries, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_labels.new_ones((self.num_queries, ))
+
+        # mask target: only positive queries get a mask and weight 1.0
+        mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds]
+        mask_weights = mask_pred.new_zeros((self.num_queries, ))
+        mask_weights[pos_inds] = 1.0
+
+        return (labels, label_weights, mask_targets, mask_weights, pos_inds,
+                neg_inds)
+
+    def loss_single(self, cls_scores, mask_preds, gt_labels_list,
+                    gt_masks_list, img_metas):
+        """Loss function for outputs from a single decoder layer.
+
+        Args:
+            cls_scores (Tensor): Mask score logits from a single decoder layer
+                for all images. Shape (batch_size, num_queries,
+                cls_out_channels). Note `cls_out_channels` should include
+                background.
+            mask_preds (Tensor): Mask logits for a pixel decoder for all
+                images. Shape (batch_size, num_queries, h, w).
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image, each with shape (num_gts, ).
+            gt_masks_list (list[Tensor]): Ground truth mask for each image,
+                each with shape (num_gts, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[Tensor]: (loss_cls, loss_mask, loss_dice) for outputs \
+                from a single decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         num_total_pos,
+         num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list,
+                                           gt_labels_list, gt_masks_list,
+                                           img_metas)
+        # shape (batch_size, num_queries)
+        labels = torch.stack(labels_list, dim=0)
+        # shape (batch_size, num_queries)
+        label_weights = torch.stack(label_weights_list, dim=0)
+        # shape (num_total_gts, h, w)
+        mask_targets = torch.cat(mask_targets_list, dim=0)
+        # shape (batch_size, num_queries)
+        mask_weights = torch.stack(mask_weights_list, dim=0)
+
+        # classification loss
+        # flatten batch and query dims -> (batch_size * num_queries, ...)
+        cls_scores = cls_scores.flatten(0, 1)
+        labels = labels.flatten(0, 1)
+        label_weights = label_weights.flatten(0, 1)
+
+        class_weight = cls_scores.new_tensor(self.class_weight)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            label_weights,
+            avg_factor=class_weight[labels].sum())
+
+        num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos]))
+        num_total_masks = max(num_total_masks, 1)
+
+        # extract positive ones
+        # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
+        mask_preds = mask_preds[mask_weights > 0]
+
+        if mask_targets.shape[0] == 0:
+            # zero match: no query was assigned to a ground truth mask
+            loss_dice = mask_preds.sum()
+            loss_mask = mask_preds.sum()
+            return loss_cls, loss_mask, loss_dice
+
+        with torch.no_grad():
+            points_coords = get_uncertain_point_coords_with_randomness(
+                mask_preds.unsqueeze(1), None, self.num_points,
+                self.oversample_ratio, self.importance_sample_ratio)
+            # shape (num_total_gts, h, w) -> (num_total_gts, num_points)
+            mask_point_targets = point_sample(
+                mask_targets.unsqueeze(1).float(), points_coords).squeeze(1)
+        # shape (num_total_gts, h, w) -> (num_total_gts, num_points)
+        mask_point_preds = point_sample(
+            mask_preds.unsqueeze(1), points_coords).squeeze(1)
+
+        # dice loss
+        loss_dice = self.loss_dice(
+            mask_point_preds, mask_point_targets, avg_factor=num_total_masks)
+
+        # mask loss
+        # shape (num_total_gts, num_points) -> (num_total_gts * num_points, 1)
+        mask_point_preds = mask_point_preds.reshape(-1, 1)
+        # shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
+        mask_point_targets = mask_point_targets.reshape(-1)
+        loss_mask = self.loss_mask(
+            mask_point_preds,
+            mask_point_targets,
+            avg_factor=num_total_masks * self.num_points)
+
+        return loss_cls, loss_mask, loss_dice
+
+    @force_fp32(apply_to=('all_cls_scores', 'all_mask_preds'))
+    def loss(self, all_cls_scores, all_mask_preds, gt_labels_list,
+             gt_masks_list, img_metas):
+        """Compute the losses of every decoder layer.
+
+        Args:
+            all_cls_scores (Tensor): Classification scores for all decoder
+                layers with shape [num_decoder, batch_size, num_queries,
+                cls_out_channels].
+            all_mask_preds (Tensor): Mask scores for all decoder layers with
+                shape [num_decoder, batch_size, num_queries, h, w].
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (n, ). n is the sum of number of stuff types
+                and number of instances in an image.
+            gt_masks_list (list[Tensor]): Ground truth mask for each image with
+                shape (n, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_layers = len(all_cls_scores)
+        # Every decoder layer is supervised with the same ground truth.
+        labels_per_layer = [gt_labels_list] * num_layers
+        masks_per_layer = [gt_masks_list] * num_layers
+        metas_per_layer = [img_metas] * num_layers
+        losses_cls, losses_mask, losses_dice = multi_apply(
+            self.loss_single, all_cls_scores, all_mask_preds,
+            labels_per_layer, masks_per_layer, metas_per_layer)
+
+        # The last decoder layer provides the un-prefixed loss entries.
+        loss_dict = {
+            'loss_cls': losses_cls[-1],
+            'loss_mask': losses_mask[-1],
+            'loss_dice': losses_dice[-1],
+        }
+        # Earlier layers are stored under keys prefixed with `d{index}.`.
+        aux_losses = zip(losses_cls[:-1], losses_mask[:-1], losses_dice[:-1])
+        for idx, (cls_i, mask_i, dice_i) in enumerate(aux_losses):
+            loss_dict[f'd{idx}.loss_cls'] = cls_i
+            loss_dict[f'd{idx}.loss_mask'] = mask_i
+            loss_dict[f'd{idx}.loss_dice'] = dice_i
+        return loss_dict
+
+    def forward_head(self, decoder_out, mask_feature, attn_mask_target_size):
+        """Forward for head part which is called after every decoder layer.
+
+        Args:
+            decoder_out (Tensor): in shape (num_queries, batch_size, c).
+            mask_feature (Tensor): in shape (batch_size, c, h, w).
+            attn_mask_target_size (tuple[int, int]): target attention
+                mask size.
+
+        Returns:
+            tuple: A tuple containing three elements.
+
+            - cls_pred (Tensor): Classification scores in shape \
+                (batch_size, num_queries, cls_out_channels). \
+                Note `cls_out_channels` should include background.
+            - mask_pred (Tensor): Mask scores in shape \
+                (batch_size, num_queries, h, w).
+            - attn_mask (Tensor): Attention mask in shape \
+                (batch_size * num_heads, num_queries, h * w).
+        """
+        decoder_out = self.transformer_decoder.post_norm(decoder_out)
+        decoder_out = decoder_out.transpose(0, 1)
+        # shape (batch_size, num_queries, c)
+        cls_pred = self.cls_embed(decoder_out)
+        # shape (batch_size, num_queries, c)
+        mask_embed = self.mask_embed(decoder_out)
+        # shape (batch_size, num_queries, h, w)
+        mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature)
+        attn_mask = F.interpolate(
+            mask_pred,
+            attn_mask_target_size,
+            mode='bilinear',
+            align_corners=False)
+        # shape (batch_size, num_queries, h, w) ->
+        #   (batch_size * num_head, num_queries, h * w)
+        attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat(
+            (1, self.num_heads, 1, 1)).flatten(0, 1)
+        attn_mask = attn_mask.sigmoid() < 0.5
+        attn_mask = attn_mask.detach()
+
+        return cls_pred, mask_pred, attn_mask
+
+    def forward(self, feats, img_metas):
+        """Forward function.
+
+        Args:
+            feats (list[Tensor]): Multi scale Features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): Image information; only its length is used.
+
+        Returns:
+            tuple: A tuple contains two elements.
+
+            - cls_pred_list (list[Tensor]): Classification logits for \
+                the initial queries and then each decoder layer. Each is \
+                a 3D-tensor (batch_size, num_queries, cls_out_channels). \
+                Note `cls_out_channels` should include background.
+            - mask_pred_list (list[Tensor]): Mask logits for the initial \
+                queries and then each decoder layer. Each with shape \
+                (batch_size, num_queries, h, w).
+        """
+        batch_size = len(img_metas)
+        mask_features, multi_scale_memorys = self.pixel_decoder(feats)
+        # multi_scale_memorys (from low resolution to high resolution)
+        decoder_inputs = []
+        decoder_positional_encodings = []
+        for i in range(self.num_transformer_feat_level):
+            decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i])
+            # shape (batch_size, c, h, w) -> (h*w, batch_size, c)
+            decoder_input = decoder_input.flatten(2).permute(2, 0, 1)
+            level_embed = self.level_embed.weight[i].view(1, 1, -1)
+            decoder_input = decoder_input + level_embed
+            # all-False bool mask (batch_size, h, w) for positional encoding
+            mask = decoder_input.new_zeros(
+                (batch_size, ) + multi_scale_memorys[i].shape[-2:],
+                dtype=torch.bool)
+            decoder_positional_encoding = self.decoder_positional_encoding(
+                mask)
+            decoder_positional_encoding = decoder_positional_encoding.flatten(
+                2).permute(2, 0, 1)
+            decoder_inputs.append(decoder_input)
+            decoder_positional_encodings.append(decoder_positional_encoding)
+        # shape (num_queries, c) -> (num_queries, batch_size, c)
+        query_feat = self.query_feat.weight.unsqueeze(1).repeat(
+            (1, batch_size, 1))
+        query_embed = self.query_embed.weight.unsqueeze(1).repeat(
+            (1, batch_size, 1))
+
+        cls_pred_list = []
+        mask_pred_list = []
+        cls_pred, mask_pred, attn_mask = self.forward_head(
+            query_feat, mask_features, multi_scale_memorys[0].shape[-2:])
+        cls_pred_list.append(cls_pred)
+        mask_pred_list.append(mask_pred)
+
+        for i in range(self.num_transformer_decoder_layers):
+            level_idx = i % self.num_transformer_feat_level
+            # if a mask is all True(all background), then set it all False.
+            attn_mask[torch.where(
+                attn_mask.sum(-1) == attn_mask.shape[-1])] = False
+
+            # cross_attn + self_attn
+            layer = self.transformer_decoder.layers[i]
+            attn_masks = [attn_mask, None]
+            query_feat = layer(
+                query=query_feat,
+                key=decoder_inputs[level_idx],
+                value=decoder_inputs[level_idx],
+                query_pos=query_embed,
+                key_pos=decoder_positional_encodings[level_idx],
+                attn_masks=attn_masks,
+                query_key_padding_mask=None,
+                # here we do not apply masking on padded region
+                key_padding_mask=None)
+            cls_pred, mask_pred, attn_mask = self.forward_head(
+                query_feat, mask_features, multi_scale_memorys[
+                    (i + 1) % self.num_transformer_feat_level].shape[-2:])
+
+            cls_pred_list.append(cls_pred)
+            mask_pred_list.append(mask_pred)
+
+        return cls_pred_list, mask_pred_list
+
+    def forward_train(self, x, img_metas, gt_semantic_seg, gt_labels,
+                      gt_masks):
+        """Forward function for training mode.
+
+        Args:
+            x (list[Tensor]): Multi-level features from the upstream network,
+                each is a 4D-tensor.
+            img_metas (list[Dict]): List of image information.
+            gt_semantic_seg (list[tensor]): Each element is the ground truth
+                of semantic segmentation with the shape (N, H, W). This
+                argument is not used by this method; only `gt_labels` and
+                `gt_masks` are passed on to `self.loss`.
+            gt_labels (list[Tensor]): Each element is ground truth labels of
+                each box, shape (num_gts,).
+            gt_masks (list[BitmapMasks]): Each element is masks of instances
+                of a image, shape (num_gts, h, w).
+
+        Returns:
+            losses (dict[str, Tensor]): a dictionary of loss components
+        """
+
+        # forward
+        all_cls_scores, all_mask_preds = self(x, img_metas)
+
+        # loss
+        losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks,
+                           img_metas)
+
+        return losses
+
+    def forward_test(self, inputs, img_metas, test_cfg):
+        """Test segment without test-time augmentation.
+
+        Only the output of the last decoder layer is used.
+
+        Args:
+            inputs (list[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+            test_cfg (dict): Testing config (unused here).
+
+        Returns:
+            seg_mask (Tensor): Class scores, shape (b, c, h, w).
+        """
+        all_cls_scores, all_mask_preds = self(inputs, img_metas)
+        cls_score, mask_pred = all_cls_scores[-1], all_mask_preds[-1]
+        # NOTE: no resizing happens here, so `ori_shape` is not read.
+
+        # semantic inference
+        cls_score = F.softmax(cls_score, dim=-1)[..., :-1]
+        mask_pred = mask_pred.sigmoid()
+        seg_mask = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred)
+        return seg_mask
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
new file mode 100644
index 00000000..1f2c8b04
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
@@ -0,0 +1,3 @@
+from .encoder_decoder_mask2former import EncoderDecoderMask2Former
+
+__all__ = ['EncoderDecoderMask2Former']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
new file mode 100644
index 00000000..8bd8fa3f
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
@@ -0,0 +1,314 @@
+# The implementation refers to the VitAdapter
+# available at
+# https://github.com/czczup/ViT-Adapter.git
+import warnings
+from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
+
+import mmcv
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import BaseModule, auto_fp16
+
+
+class BaseSegmentor(BaseModule, metaclass=ABCMeta):
+    """Base class for segmentors."""
+
+    def __init__(self, init_cfg=None):
+        super(BaseSegmentor, self).__init__(init_cfg)
+        self.fp16_enabled = False
+
+    @property
+    def with_neck(self):
+        """bool: whether the segmentor has neck"""
+        return hasattr(self, 'neck') and self.neck is not None
+
+    @property
+    def with_auxiliary_head(self):
+        """bool: whether the segmentor has auxiliary head"""
+        return hasattr(self,
+                       'auxiliary_head') and self.auxiliary_head is not None
+
+    @property
+    def with_decode_head(self):
+        """bool: whether the segmentor has decode head"""
+        return hasattr(self, 'decode_head') and self.decode_head is not None
+
+    @abstractmethod
+    def extract_feat(self, imgs):
+        """Placeholder for extract features from images."""
+        pass
+
+    @abstractmethod
+    def encode_decode(self, img, img_metas):
+        """Placeholder for encode images with backbone and decode into a
+        semantic segmentation map of the same size as input."""
+        pass
+
+    @abstractmethod
+    def forward_train(self, imgs, img_metas, **kwargs):
+        """Placeholder for Forward function for training."""
+        pass
+
+    @abstractmethod
+    def simple_test(self, img, img_meta, **kwargs):
+        """Placeholder for single image test."""
+        pass
+
+    @abstractmethod
+    def aug_test(self, imgs, img_metas, **kwargs):
+        """Placeholder for augmentation test."""
+        pass
+
+    def forward_test(self, imgs, img_metas, **kwargs):
+        """
+        Args:
+            imgs (List[Tensor]): the outer list indicates test-time
+                augmentations and inner Tensor should have a shape NxCxHxW,
+                which contains all images in the batch.
+            img_metas (List[List[dict]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch.
+        """
+        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError(f'{name} must be a list, but got '
+                                f'{type(var)}')
+
+        num_augs = len(imgs)
+        if num_augs != len(img_metas):
+            raise ValueError(f'num of augmentations ({len(imgs)}) != '
+                             f'num of image meta ({len(img_metas)})')
+
+        # all images in the same aug batch all of the same ori_shape and pad
+        # shape
+        def tensor_to_tuple(input_tensor):
+            return tuple(input_tensor.cpu().numpy())
+
+        for img_meta in img_metas:
+            ori_shapes = [_['ori_shape'] for _ in img_meta]
+            if isinstance(ori_shapes[0], torch.Tensor):
+                assert all(
+                    tensor_to_tuple(shape) == tensor_to_tuple(ori_shapes[0])
+                    for shape in ori_shapes)
+            else:
+                assert all(shape == ori_shapes[0] for shape in ori_shapes)
+
+            img_shapes = [_['img_shape'] for _ in img_meta]
+            if isinstance(img_shapes[0], torch.Tensor):
+                assert all(
+                    tensor_to_tuple(shape) == tensor_to_tuple(img_shapes[0])
+                    for shape in img_shapes)
+            else:
+                assert all(shape == img_shapes[0] for shape in img_shapes)
+
+            pad_shapes = [_['pad_shape'] for _ in img_meta]
+            if isinstance(pad_shapes[0], torch.Tensor):
+                assert all(
+                    tensor_to_tuple(shape) == tensor_to_tuple(pad_shapes[0])
+                    for shape in pad_shapes)
+            else:
+                assert all(shape == pad_shapes[0] for shape in pad_shapes)
+
+        if num_augs == 1:
+            return self.simple_test(imgs[0], img_metas[0], **kwargs)
+        else:
+            return self.aug_test(imgs, img_metas, **kwargs)
+
+    @auto_fp16(apply_to=('img', ))
+    def forward(self, img, img_metas, return_loss=True, **kwargs):
+        """Calls either :func:`forward_train` or :func:`forward_test` depending
+        on whether ``return_loss`` is ``True``.
+
+        Note this setting will change the expected inputs. When
+        ``return_loss=True``, img and img_metas are single-nested (i.e. Tensor
+        and List[dict]), and when ``return_loss=False``, img and img_metas
+        should be double nested (i.e.  List[Tensor], List[List[dict]]), with
+        the outer list indicating test time augmentations.
+        """
+        if return_loss:
+            return self.forward_train(img, img_metas, **kwargs)
+        else:
+            return self.forward_test(img, img_metas, **kwargs)
+
+    def train_step(self, data_batch, optimizer, **kwargs):
+        """The iteration step during training.
+
+        This method defines an iteration step during training, except for the
+        back propagation and optimizer updating, which are done in an optimizer
+        hook. Note that in some complicated cases or models, the whole process
+        including back propagation and optimizer updating is also defined in
+        this method, such as GAN.
+
+        Args:
+            data_batch (dict): The output of dataloader.
+            optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
+                runner is passed to ``train_step()``. This argument is unused
+                and reserved.
+
+        Returns:
+            dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
+                ``num_samples``.
+                ``loss`` is a tensor for back propagation, which can be a
+                weighted sum of multiple losses.
+                ``log_vars`` contains all the variables to be sent to the
+                logger.
+                ``num_samples`` indicates the batch size (when the model is
+                DDP, it means the batch size on each GPU), which is used for
+                averaging the logs.
+        """
+        losses = self(**data_batch)
+        loss, log_vars = self._parse_losses(losses)
+
+        outputs = dict(
+            loss=loss,
+            log_vars=log_vars,
+            num_samples=len(data_batch['img_metas']))
+
+        return outputs
+
+    def val_step(self, data_batch, optimizer=None, **kwargs):
+        """The iteration step during validation.
+
+        This method shares the same signature as :func:`train_step`, but used
+        during val epochs. Note that the evaluation after training epochs is
+        not implemented with this method, but an evaluation hook.
+        """
+        losses = self(**data_batch)
+        loss, log_vars = self._parse_losses(losses)
+
+        log_vars_ = dict()
+        for loss_name, loss_value in log_vars.items():
+            k = loss_name + '_val'
+            log_vars_[k] = loss_value
+
+        outputs = dict(
+            loss=loss,
+            log_vars=log_vars_,
+            num_samples=len(data_batch['img_metas']))
+
+        return outputs
+
+    @staticmethod
+    def _parse_losses(losses):
+        """Parse the raw outputs (losses) of the network.
+
+        Args:
+            losses (dict): Raw output of the network, which usually contain
+                losses and other necessary information.
+
+        Returns:
+            tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor
+                which may be a weighted sum of all losses, log_vars contains
+                all the variables to be sent to the logger.
+        """
+        log_vars = OrderedDict()
+        for loss_name, loss_value in losses.items():
+            if isinstance(loss_value, torch.Tensor):
+                log_vars[loss_name] = loss_value.mean()
+            elif isinstance(loss_value, list):
+                log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
+            else:
+                raise TypeError(
+                    f'{loss_name} is not a tensor or list of tensors')
+
+        loss = sum(_value for _key, _value in log_vars.items()
+                   if 'loss' in _key)
+
+        # If the loss_vars has different length, raise assertion error
+        # to prevent GPUs from infinite waiting.
+        if dist.is_available() and dist.is_initialized():
+            log_var_length = torch.tensor(len(log_vars), device=loss.device)
+            dist.all_reduce(log_var_length)
+            message = (f'rank {dist.get_rank()}'
+                       + f' len(log_vars): {len(log_vars)}' + ' keys: '
+                       + ','.join(log_vars.keys()) + '\n')
+            assert log_var_length == len(log_vars) * dist.get_world_size(), \
+                'loss log variables are different across GPUs!\n' + message
+
+        log_vars['loss'] = loss
+        for loss_name, loss_value in log_vars.items():
+            # reduce loss when distributed training
+            if dist.is_available() and dist.is_initialized():
+                loss_value = loss_value.data.clone()
+                dist.all_reduce(loss_value.div_(dist.get_world_size()))
+            log_vars[loss_name] = loss_value.item()
+
+        return loss, log_vars
+
+    def show_result(self,
+                    img,
+                    result,
+                    palette=None,
+                    win_name='',
+                    show=False,
+                    wait_time=0,
+                    out_file=None,
+                    opacity=0.5):
+        """Draw `result` over `img`.
+
+        Args:
+            img (str or Tensor): The image to be displayed.
+            result (Tensor): The semantic segmentation results to draw over
+                `img`.
+            palette (list[list[int]]] | np.ndarray | None): The palette of
+                segmentation map. If None is given, random palette will be
+                generated. Default: None
+            win_name (str): The window name.
+            wait_time (int): Value of waitKey param.
+                Default: 0.
+            show (bool): Whether to show the image.
+                Default: False.
+            out_file (str or None): The filename to write the image.
+                Default: None.
+            opacity(float): Opacity of painted segmentation map.
+                Default 0.5.
+                Must be in (0, 1] range.
+        Returns:
+            img (Tensor): Only if not `show` or `out_file`
+        """
+        img = mmcv.imread(img)
+        img = img.copy()
+        seg = result[0]
+        if palette is None:
+            if self.PALETTE is None:
+                # Get random state before set seed,
+                # and restore random state later.
+                # It will prevent loss of randomness, as the palette
+                # may be different in each iteration if not specified.
+                # See: https://github.com/open-mmlab/mmdetection/issues/5844
+                state = np.random.get_state()
+                np.random.seed(42)
+                # random palette
+                palette = np.random.randint(
+                    0, 255, size=(len(self.CLASSES), 3))
+                np.random.set_state(state)
+            else:
+                palette = self.PALETTE
+        palette = np.array(palette)
+        assert palette.shape[0] == len(self.CLASSES)
+        assert palette.shape[1] == 3
+        assert len(palette.shape) == 2
+        assert 0 < opacity <= 1.0
+        color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
+        for label, color in enumerate(palette):
+            color_seg[seg == label, :] = color
+        # convert to BGR
+        color_seg = color_seg[..., ::-1]
+
+        img = img * (1 - opacity) + color_seg * opacity
+        img = img.astype(np.uint8)
+        # if out_file specified, do not show image in window
+        if out_file is not None:
+            show = False
+
+        if show:
+            mmcv.imshow(img, win_name, wait_time)
+        if out_file is not None:
+            mmcv.imwrite(img, out_file)
+
+        if not (show or out_file):
+            warnings.warn('show==False and out_file is not specified, only '
+                          'result image will be returned')
+            return img
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
new file mode 100644
index 00000000..9287e8aa
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
@@ -0,0 +1,303 @@
+# The implementation refers to the VitAdapter
+# available at
+# https://github.com/czczup/ViT-Adapter.git
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmdet.models import builder
+from mmdet.models.builder import DETECTORS
+
+from ...utils import add_prefix, seg_resize
+from .base_segmentor import BaseSegmentor
+
+
+@DETECTORS.register_module()
+class EncoderDecoderMask2Former(BaseSegmentor):
+    """Encoder Decoder segmentors.
+
+    EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+    Note that auxiliary_head is only used for deep supervision during training,
+    which could be dumped during inference.
+    """
+
+    def __init__(self,
+                 backbone,
+                 decode_head,
+                 neck=None,
+                 auxiliary_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(EncoderDecoderMask2Former, self).__init__(init_cfg)
+        if pretrained is not None:
+            assert backbone.get('pretrained') is None, \
+                'both backbone and segmentor set pretrained weight'
+            backbone.pretrained = pretrained
+        self.backbone = builder.build_backbone(backbone)
+        if neck is not None:
+            self.neck = builder.build_neck(neck)
+        decode_head.update(train_cfg=train_cfg)
+        decode_head.update(test_cfg=test_cfg)
+        self._init_decode_head(decode_head)
+        self._init_auxiliary_head(auxiliary_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        assert self.with_decode_head
+
+    def _init_decode_head(self, decode_head):
+        """Initialize ``decode_head``"""
+        self.decode_head = builder.build_head(decode_head)
+        self.align_corners = self.decode_head.align_corners
+        self.num_classes = self.decode_head.num_classes
+
+    def _init_auxiliary_head(self, auxiliary_head):
+        """Initialize ``auxiliary_head``"""
+        if auxiliary_head is not None:
+            if isinstance(auxiliary_head, list):
+                self.auxiliary_head = nn.ModuleList()
+                for head_cfg in auxiliary_head:
+                    self.auxiliary_head.append(builder.build_head(head_cfg))
+            else:
+                self.auxiliary_head = builder.build_head(auxiliary_head)
+
+    def extract_feat(self, img):
+        """Extract features from images."""
+        x = self.backbone(img)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def encode_decode(self, img, img_metas):
+        """Encode images with backbone and decode into a semantic segmentation
+        map of the same size as input."""
+        x = self.extract_feat(img)
+        out = self._decode_head_forward_test(x, img_metas)
+        out = seg_resize(
+            input=out,
+            size=img.shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        return out
+
+    def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg,
+                                   **kwargs):
+        """Run forward function and calculate loss for decode head in
+        training."""
+        losses = dict()
+        loss_decode = self.decode_head.forward_train(x, img_metas,
+                                                     gt_semantic_seg, **kwargs)
+
+        losses.update(add_prefix(loss_decode, 'decode'))
+        return losses
+
+    def _decode_head_forward_test(self, x, img_metas):
+        """Run the decode head's ``forward_test`` with ``self.test_cfg`` and
+        return its output (no loss is computed here)."""
+        seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg)
+        return seg_logits
+
+    def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg):
+        """Run forward function and calculate loss for auxiliary head in
+        training."""
+        losses = dict()
+        if isinstance(self.auxiliary_head, nn.ModuleList):
+            for idx, aux_head in enumerate(self.auxiliary_head):
+                loss_aux = aux_head.forward_train(x, img_metas,
+                                                  gt_semantic_seg,
+                                                  self.train_cfg)
+                losses.update(add_prefix(loss_aux, f'aux_{idx}'))
+        else:
+            loss_aux = self.auxiliary_head.forward_train(
+                x, img_metas, gt_semantic_seg, self.train_cfg)
+            losses.update(add_prefix(loss_aux, 'aux'))
+
+        return losses
+
+    def forward_dummy(self, img):
+        """Dummy forward function."""
+        seg_logit = self.encode_decode(img, None)
+
+        return seg_logit
+
+    def forward_train(self, img, img_metas, gt_semantic_seg, **kwargs):
+        """Forward function for training.
+
+        Args:
+            img (Tensor): Input images.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            gt_semantic_seg (Tensor): Semantic segmentation masks
+                used if the architecture supports semantic segmentation task.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+
+        x = self.extract_feat(img)
+
+        losses = dict()
+
+        loss_decode = self._decode_head_forward_train(x, img_metas,
+                                                      gt_semantic_seg,
+                                                      **kwargs)
+        losses.update(loss_decode)
+
+        if self.with_auxiliary_head:
+            loss_aux = self._auxiliary_head_forward_train(
+                x, img_metas, gt_semantic_seg)
+            losses.update(loss_aux)
+
+        return losses
+
+    # TODO refactor
+    def slide_inference(self, img, img_meta, rescale):
+        """Inference by sliding-window with overlap.
+
+        If h_crop > h_img or w_crop > w_img, the small patch is decoded
+        without padding. With ``rescale``, resize to ``ori_shape[:2]``.
+        """
+
+        h_stride, w_stride = self.test_cfg.stride
+        h_crop, w_crop = self.test_cfg.crop_size
+        batch_size, _, h_img, w_img = img.size()
+        num_classes = self.num_classes
+        h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
+        w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
+        preds = img.new_zeros((batch_size, num_classes, h_img, w_img))
+        count_mat = img.new_zeros((batch_size, 1, h_img, w_img))
+        for h_idx in range(h_grids):
+            for w_idx in range(w_grids):
+                y1 = h_idx * h_stride
+                x1 = w_idx * w_stride
+                y2 = min(y1 + h_crop, h_img)
+                x2 = min(x1 + w_crop, w_img)
+                y1 = max(y2 - h_crop, 0)
+                x1 = max(x2 - w_crop, 0)
+                crop_img = img[:, :, y1:y2, x1:x2]
+                crop_seg_logit = self.encode_decode(crop_img, img_meta)
+                preds += F.pad(crop_seg_logit,
+                               (int(x1), int(preds.shape[3] - x2), int(y1),
+                                int(preds.shape[2] - y2)))
+
+                count_mat[:, :, y1:y2, x1:x2] += 1
+        assert (count_mat == 0).sum() == 0
+        if torch.onnx.is_in_onnx_export():
+            # cast count_mat to constant while exporting to ONNX
+            count_mat = torch.from_numpy(
+                count_mat.cpu().detach().numpy()).to(device=img.device)
+        preds = preds / count_mat
+
+        def tensor_to_tuple(input_tensor):
+            return tuple(input_tensor.cpu().numpy())
+        # only (h, w) is a valid resize target, whatever type ori_shape has
+        if rescale:
+            preds = seg_resize(
+                preds,
+                size=tensor_to_tuple(img_meta[0]['ori_shape'])[:2]
+                if isinstance(img_meta[0]['ori_shape'], torch.Tensor) else
+                img_meta[0]['ori_shape'][:2],
+                mode='bilinear',
+                align_corners=self.align_corners,
+                warning=False)
+        return preds
+
+    def whole_inference(self, img, img_meta, rescale):
+        """Inference with full image."""
+
+        seg_logit = self.encode_decode(img, img_meta)
+        if rescale:
+            # support dynamic shape for onnx
+            if torch.onnx.is_in_onnx_export():
+                size = img.shape[2:]
+            else:
+                size = img_meta[0]['ori_shape'][:2]
+            seg_logit = seg_resize(
+                seg_logit,
+                size=size,
+                mode='bilinear',
+                align_corners=self.align_corners,
+                warning=False)
+
+        return seg_logit
+
+    def inference(self, img, img_meta, rescale):
+        """Inference with slide/whole style.
+
+        Args:
+            img (Tensor): The input image of shape (N, 3, H, W).
+            img_meta (list[dict]): Image info dicts. This method reads
+                'ori_shape' and 'flip' (plus 'flip_direction' when flipped);
+                other keys such as 'img_shape', 'scale_factor', 'filename',
+                'pad_shape' and 'img_norm_cfg' may also be present, see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            rescale (bool): Whether rescale back to original shape.
+
+        Returns:
+            Tensor: Softmax over the class dim (dim=1), un-flipped if needed.
+        """
+
+        assert self.test_cfg.mode in ['slide', 'whole']
+        ori_shape = img_meta[0]['ori_shape']
+
+        def tensor_to_tuple(input_tensor):
+            return tuple(input_tensor.cpu().numpy())
+
+        if isinstance(ori_shape, torch.Tensor):
+            assert all(
+                tensor_to_tuple(_['ori_shape']) == tensor_to_tuple(ori_shape)
+                for _ in img_meta)
+        else:
+            assert all(_['ori_shape'] == ori_shape for _ in img_meta)
+        if self.test_cfg.mode == 'slide':
+            seg_logit = self.slide_inference(img, img_meta, rescale)
+        else:
+            seg_logit = self.whole_inference(img, img_meta, rescale)
+        output = F.softmax(seg_logit, dim=1)
+        flip = img_meta[0]['flip']
+        if flip:
+            flip_direction = img_meta[0]['flip_direction']
+            assert flip_direction in ['horizontal', 'vertical']
+            if flip_direction == 'horizontal':
+                output = output.flip(dims=(3, ))
+            elif flip_direction == 'vertical':
+                output = output.flip(dims=(2, ))
+
+        return output
+
+    def simple_test(self, img, img_meta, rescale=True):
+        """Simple test with single image."""
+        seg_logit = self.inference(img, img_meta, rescale)
+        seg_pred = seg_logit.argmax(dim=1)
+        if torch.onnx.is_in_onnx_export():
+            # our inference backend only support 4D output
+            seg_pred = seg_pred.unsqueeze(0)
+            return seg_pred
+        seg_pred = seg_pred.cpu().numpy()
+        # unravel batch dim
+        seg_pred = list(seg_pred)
+        return seg_pred
+
+    def aug_test(self, imgs, img_metas, rescale=True):
+        """Test with augmentations.
+
+        Only rescale=True is supported.
+        """
+        # aug_test rescale all imgs back to ori_shape for now
+        assert rescale
+        # to save memory, we get augmented seg logit inplace
+        seg_logit = self.inference(imgs[0], img_metas[0], rescale)
+        for i in range(1, len(imgs)):
+            cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale)
+            seg_logit += cur_seg_logit
+        seg_logit /= len(imgs)
+        seg_pred = seg_logit.argmax(dim=1)
+        seg_pred = seg_pred.cpu().numpy()
+        # unravel batch dim
+        seg_pred = list(seg_pred)
+        return seg_pred
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
new file mode 100644
index 00000000..dec8a5f2
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
@@ -0,0 +1,7 @@
+from .builder import build_pixel_sampler
+from .data_process_func import ResizeToMultiple
+from .seg_func import add_prefix, seg_resize
+
+__all__ = [
+    'seg_resize', 'add_prefix', 'build_pixel_sampler', 'ResizeToMultiple'
+]
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
new file mode 100644
index 00000000..63d77fea
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
@@ -0,0 +1,11 @@
+# The implementation refers to the VitAdapter
+# available at
+# https://github.com/czczup/ViT-Adapter.git
+from mmcv.utils import Registry, build_from_cfg
+
+PIXEL_SAMPLERS = Registry('pixel sampler')
+
+
+def build_pixel_sampler(cfg, **default_args):
+    """Build a pixel sampler from ``cfg`` using ``PIXEL_SAMPLERS``."""
+    return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args)
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py
new file mode 100644
index 00000000..194361af
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py
@@ -0,0 +1,60 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+from mmdet.datasets.builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class ResizeToMultiple(object):
+    """Resize images & seg to multiple of divisor.
+
+    Args:
+        size_divisor (int): images and gt seg maps need to resize to multiple
+            of size_divisor. Default: 32.
+        interpolation (str, optional): The interpolation mode of image resize.
+            Default: None, which falls back to 'bilinear'.
+    """
+
+    def __init__(self, size_divisor=32, interpolation=None):
+        self.size_divisor = size_divisor
+        self.interpolation = interpolation
+
+    def __call__(self, results):
+        """Call function to resize images, semantic segmentation map to
+        multiple of size divisor.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Resized results; 'img', 'img_shape', 'pad_shape' are updated.
+        """
+        # Align image to multiple of size divisor.
+        img = results['img']
+        img = mmcv.imresize_to_multiple(
+            img,
+            self.size_divisor,
+            scale_factor=1,
+            interpolation=self.interpolation
+            if self.interpolation else 'bilinear')
+
+        results['img'] = img
+        results['img_shape'] = img.shape
+        results['pad_shape'] = img.shape
+
+        # Align segmentation maps too; 'nearest' does not blend label values.
+        for key in results.get('seg_fields', []):
+            gt_seg = results[key]
+            gt_seg = mmcv.imresize_to_multiple(
+                gt_seg,
+                self.size_divisor,
+                scale_factor=1,
+                interpolation='nearest')
+            results[key] = gt_seg
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += (f'(size_divisor={self.size_divisor}, '
+                     f'interpolation={self.interpolation})')
+        return repr_str
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
new file mode 100644
index 00000000..fba46b81
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
@@ -0,0 +1,48 @@
+# The implementation refers to the VitAdapter
+# available at
+# https://github.com/czczup/ViT-Adapter.git
+
+import warnings
+
+import torch.nn.functional as F
+
+
+def seg_resize(input,
+               size=None,
+               scale_factor=None,
+               mode='nearest',
+               align_corners=None,
+               warning=True):
+    """``F.interpolate`` wrapper that warns about misaligned upsampling."""
+    if warning and size is not None and align_corners:
+        in_h, in_w = (int(x) for x in input.shape[2:])
+        out_h, out_w = (int(x) for x in size)
+        upsampling = out_h > in_h or out_w > in_w
+        all_gt_one = out_h > 1 and out_w > 1 and in_h > 1 and in_w > 1
+        if (upsampling and all_gt_one and (out_h - 1) % (in_h - 1)
+                and (out_w - 1) % (in_w - 1)):
+            warnings.warn(
+                f'When align_corners={align_corners}, '
+                'the output would more aligned if '
+                f'input size {(in_h, in_w)} is `x+1` and '
+                f'out size {(out_h, out_w)} is `nx+1`')
+    return F.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+def add_prefix(inputs, prefix):
+    """Return a new dict whose keys are those of ``inputs`` with a prefix.
+
+    Args:
+        inputs (dict): The input dict with str keys.
+        prefix (str): The prefix to add.
+
+    Returns:
+        dict: Maps ``'<prefix>.<name>'`` to the value stored under
+            ``name`` in ``inputs``.
+
+    Example:
+        >>> add_prefix({'loss': 1}, 'decode')
+        {'decode.loss': 1}
+    """
+
+    return {f'{prefix}.{name}': value for name, value in inputs.items()}
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index d084a91b..f4b4ae3e 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -26,6 +26,7 @@ if TYPE_CHECKING:
     from .image_panoptic_segmentation_pipeline import ImagePanopticSegmentationPipeline
     from .image_portrait_enhancement_pipeline import ImagePortraitEnhancementPipeline
     from .image_reid_person_pipeline import ImageReidPersonPipeline
+    from .image_semantic_segmentation_pipeline import ImageSemanticSegmentationPipeline
     from .image_style_transfer_pipeline import ImageStyleTransferPipeline
     from .image_super_resolution_pipeline import ImageSuperResolutionPipeline
     from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline
@@ -66,6 +67,8 @@ else:
         'image_portrait_enhancement_pipeline':
         ['ImagePortraitEnhancementPipeline'],
         'image_reid_person_pipeline': ['ImageReidPersonPipeline'],
+        'image_semantic_segmentation_pipeline':
+        ['ImageSemanticSegmentationPipeline'],
         'image_style_transfer_pipeline': ['ImageStyleTransferPipeline'],
         'image_super_resolution_pipeline': ['ImageSuperResolutionPipeline'],
         'image_to_image_translation_pipeline':
diff --git a/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py
new file mode 100644
index 00000000..e3e1fd6b
--- /dev/null
+++ b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py
@@ -0,0 +1,95 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_segmentation,
+    module_name=Pipelines.image_semantic_segmentation)
+class ImageSemanticSegmentationPipeline(Pipeline):
+    """Semantic segmentation pipeline driven by the model's test config."""
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create an image semantic segmentation pipeline
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        logger.info('semantic segmentation model, pipeline init')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Load ``input`` and run the model's test-time data pipeline.
+
+        Args:
+            input: image path, PIL image or numpy array (2-D gray or BGR).
+
+        Returns:
+            The collated batch holding ``img`` and ``img_metas``.
+
+        Raises:
+            TypeError: if ``input`` has any other type.
+        """
+        import copy
+        from mmdet.datasets.pipelines import Compose
+        from mmcv.parallel import collate, scatter
+        from mmdet.datasets import replace_ImageToTensor
+
+        # Work on a private copy: patching the loader type on the shared
+        # model config would break a later call that passes a file path.
+        pipeline_cfg = copy.deepcopy(self.model.cfg.data.test.pipeline)
+        if isinstance(input, str):
+            # file name: the configured loading step reads the image
+            data = dict(img_info=dict(filename=input), img_prefix=None)
+        elif isinstance(input, PIL.Image.Image):
+            pipeline_cfg[0].type = 'LoadImageFromWebcam'
+            # force 3 channels (gray/RGBA/palette inputs), then RGB -> BGR
+            data = dict(img=np.array(input.convert('RGB'))[:, :, ::-1])
+        elif isinstance(input, np.ndarray):
+            pipeline_cfg[0].type = 'LoadImageFromWebcam'
+            if len(input.shape) == 2:
+                img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
+            else:
+                img = input
+            data = dict(img=img)
+        else:
+            raise TypeError(f'input should be either str, PIL.Image,'
+                            f' np.array, but got {type(input)}')
+        test_pipeline = Compose(replace_ImageToTensor(pipeline_cfg))
+        data = test_pipeline(data)
+        # copy from mmdet_model collect data
+        data = collate([data], samples_per_gpu=1)
+        data['img_metas'] = [
+            img_metas.data[0] for img_metas in data['img_metas']
+        ]
+        data['img'] = [img.data[0] for img in data['img']]
+        if next(self.model.parameters()).is_cuda:
+            # scatter to specified GPU
+            data = scatter(data, [next(self.model.parameters()).device])[0]
+        return data
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run the model's ``inference`` on the preprocessed batch."""
+        return self.model.inference(input)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Keep the masks, labels and scores of the model's postprocess."""
+        results = self.model.postprocess(inputs)
+        return {
+            OutputKeys.MASKS: results[OutputKeys.MASKS],
+            OutputKeys.LABELS: results[OutputKeys.LABELS],
+            OutputKeys.SCORES: results[OutputKeys.SCORES]
+        }
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index fca0e54f..9ded7ef3 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -153,3 +153,16 @@ def panoptic_seg_masks_to_image(masks):
         draw_img[mask] = color_mask
 
     return draw_img
+
+
+def semantic_seg_masks_to_image(masks):
+    """Paint every mask with its own colour of the COCO palette.
+
+    Returns an (H, W, 3) array; later masks overwrite earlier ones.
+    """
+    from mmdet.core.visualization.palette import get_palette
+    palette = get_palette('coco', 133)
+    canvas = np.zeros([masks[0].shape[0], masks[0].shape[1], 3])
+    for idx, region in enumerate(masks):
+        canvas[region.astype(bool)] = palette[idx]
+    return canvas
diff --git a/tests/pipelines/test_image_semantic_segmentation.py b/tests/pipelines/test_image_semantic_segmentation.py
new file mode 100644
index 00000000..6738976c
--- /dev/null
+++ b/tests/pipelines/test_image_semantic_segmentation.py
@@ -0,0 +1,54 @@
+import unittest
+
+import cv2
+import PIL
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import semantic_seg_masks_to_image
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+
+class ImageSemanticSegmentationTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_semantic_segmentation_panmerge(self):
+        input_location = 'data/test/images/image_semantic_segmentation.jpg'
+        model_id = 'damo/cv_swinL_semantic-segmentation_cocopanmerge'
+        segmenter = pipeline(Tasks.image_segmentation, model=model_id)
+        result = segmenter(input_location)
+        # render the predicted masks and save them for manual inspection
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_panmerge DONE')
+        # repeat with the same image passed as a PIL image
+        PIL_array = PIL.Image.open(input_location)
+        result = segmenter(PIL_array)
+
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_panmerge_from_PIL DONE')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_semantic_segmentation_vitadapter(self):
+        input_location = 'data/test/images/image_semantic_segmentation.jpg'
+        model_id = 'damo/cv_vitadapter_semantic-segmentation_cocostuff164k'
+        segmenter = pipeline(Tasks.image_segmentation, model=model_id)
+        result = segmenter(input_location)
+        # render the predicted masks and save them for manual inspection
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_vitadapter DONE')
+        # repeat with the same image passed as a PIL image
+        PIL_array = PIL.Image.open(input_location)
+        result = segmenter(PIL_array)
+
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_vitadapter_from_PIL DONE')
+
+
+if __name__ == '__main__':
+    unittest.main()

From b92e2ca0a05bf45012713ea4f425e5c6a00adf91 Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Thu, 25 Aug 2022 21:26:51 +0800
Subject: [PATCH 007/175] [to #42322933] add vqa and caption finetuning for
 mplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

添加 mplug 模型 caption 及 vqa 任务的 finetuning 支持
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9858028
---
 modelscope/metrics/builder.py                 |   2 +
 .../multi_modal/mplug/modeling_mplug.py       | 110 ++-------------
 .../models/multi_modal/mplug_for_all_tasks.py |  50 +++++--
 .../nlp/gpt3/gpt3_for_text_generation.py      |   3 +-
 .../nlp/palm_v2/palm_for_text_generation.py   |  15 +-
 modelscope/preprocessors/multi_modal.py       |  64 +++++----
 tests/trainers/test_finetune_mplug.py         | 128 ++++++++++++++++++
 7 files changed, 225 insertions(+), 147 deletions(-)
 create mode 100644 tests/trainers/test_finetune_mplug.py

diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py
index c76fe386..ad41fd87 100644
--- a/modelscope/metrics/builder.py
+++ b/modelscope/metrics/builder.py
@@ -30,6 +30,8 @@ task_default_metrics = {
     Tasks.image_portrait_enhancement:
     [Metrics.image_portrait_enhancement_metric],
     Tasks.video_summarization: [Metrics.video_summarization_metric],
+    Tasks.image_captioning: [Metrics.text_gen_metric],
+    Tasks.visual_question_answering: [Metrics.text_gen_metric],
 }
 
 
diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py
index 50622cc0..6311bd31 100755
--- a/modelscope/models/multi_modal/mplug/modeling_mplug.py
+++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py
@@ -1969,71 +1969,6 @@ class MPlug(PreTrainedModel):
                 [init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
         return torch.index_select(x, dim, order_index.to(x.device))
 
-    def rank_answer(self, question_states, question_atts, answer_ids,
-                    answer_atts, k):
-
-        num_ques = question_states.size(0)
-        start_ids = answer_ids[0, 0].repeat(num_ques, 1)  # bos token
-
-        start_output = self.text_decoder(
-            start_ids,
-            encoder_hidden_states=question_states,
-            encoder_attention_mask=question_atts,
-            return_dict=True,
-            reduction='none')
-        logits = start_output.logits[:, 0, :]  # first token's logit
-
-        # topk_probs: top-k probability
-        # topk_ids: [num_question, k]
-        answer_first_token = answer_ids[:, 1]
-        prob_first_token = F.softmax(
-            logits, dim=1).index_select(
-                dim=1, index=answer_first_token)
-        topk_probs, topk_ids = prob_first_token.topk(k, dim=1)
-
-        # answer input: [num_question*k, answer_len]
-        input_ids = []
-        input_atts = []
-        for b, topk_id in enumerate(topk_ids):
-            input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
-            input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
-        input_ids = torch.cat(input_ids, dim=0)
-        input_atts = torch.cat(input_atts, dim=0)
-
-        targets_ids = input_ids.masked_fill(
-            input_ids == self.tokenizer.pad_token_id, -100)
-
-        # repeat encoder's output for top-k answers
-        question_states = self._tile(question_states, 0, k)
-        question_atts = self._tile(question_atts, 0, k)
-
-        output = self.text_decoder(
-            input_ids,
-            attention_mask=input_atts,
-            encoder_hidden_states=question_states,
-            encoder_attention_mask=question_atts,
-            labels=targets_ids,
-            return_dict=True,
-            reduction='none')
-
-        answer_loss = output.loss
-        answer_loss = answer_loss.view(input_ids.size(0), -1)
-
-        # topk_prob: first token probability
-        topk_probs = topk_probs.view(-1, 1)
-        log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1)
-
-        # re-calculate log probabilities for the answer sequences using chain rule
-        log_probs_sum = log_probs.sum(1)
-        log_probs_sum = log_probs_sum.view(num_ques, k)
-
-        topk_probs = F.softmax(log_probs_sum, dim=-1)
-        # get top-k after re-ranking
-        topk_probs, rerank_id = topk_probs.topk(k, dim=1)
-        topk_ids = torch.gather(topk_ids, 1, rerank_id)
-
-        return topk_ids, topk_probs
-
 
 class MPlugForVisualQuestionAnswering(MPlug):
 
@@ -2111,6 +2046,8 @@ class MPlugForVisualQuestionAnswering(MPlug):
             merge_text_attention = torch.cat(
                 [image_atts, question.attention_mask], 1)
 
+            if k is None:
+                k = [1] * question_output.shape[0]
             question_states = []
             question_atts = []
             for b, n in enumerate(k):
@@ -2177,6 +2114,8 @@ class MPlugForVisualQuestionAnswering(MPlug):
                     return_dict=True,
                     reduction='none',
                 )
+            if weights is None:
+                weights = 1
             loss = weights * answer_output.loss
             loss = loss.sum() / image.size(0)
 
@@ -2262,50 +2201,17 @@ class MPLUGForImageCaption(MPlug):
         if train:
             answer_targets = answer.input_ids.masked_fill(
                 answer.input_ids == self.tokenizer.pad_token_id, -100)
-            text_output = self.text_encoder(
-                question.input_ids,
-                attention_mask=question.attention_mask,
-                return_dict=True)
-            text_embeds = text_output.last_hidden_state
-            fusion_output = self.fusion_encoder(
-                encoder_embeds=text_embeds,
-                attention_mask=question.attention_mask,
-                encoder_hidden_states=image_embeds,
-                encoder_attention_mask=image_atts,
-                return_dict=False)
-
-            image_output, question_output = fusion_output
-
-            question_output = torch.cat([image_output, question_output], 1)
-            merge_text_attention = torch.cat(
-                [image_atts, question.attention_mask], 1)
-
             answer_output = self.text_decoder(
                 answer.input_ids,
                 attention_mask=answer.attention_mask,
-                encoder_hidden_states=question_output,
-                encoder_attention_mask=merge_text_attention,
+                encoder_hidden_states=image_embeds,
+                encoder_attention_mask=image_atts,
                 labels=answer_targets,
                 return_dict=True,
                 reduction='none')
             loss = answer_output.loss
+
             return loss
         else:
-            text_output = self.text_encoder(
-                question.input_ids,
-                attention_mask=question.attention_mask,
-                return_dict=True)
-            text_embeds = text_output.last_hidden_state
-            fusion_output = self.fusion_encoder(
-                encoder_embeds=text_embeds,
-                attention_mask=question.attention_mask,
-                encoder_hidden_states=image_embeds,
-                encoder_attention_mask=image_atts,
-                return_dict=False)
-            image_output, question_output = fusion_output
-            question_output = torch.cat([image_output, question_output], 1)
-            merge_text_attention = torch.cat(
-                [image_atts, question.attention_mask], 1)
-            topk_ids, topk_probs = self.generation(question_output,
-                                                   merge_text_attention)
+            topk_ids, topk_probs = self.generation(image_embeds, image_atts)
             return topk_ids, topk_probs
diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py
index bb5a9c46..fb460714 100644
--- a/modelscope/models/multi_modal/mplug_for_all_tasks.py
+++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, List
 
 from modelscope.metainfo import Models
 from modelscope.models import TorchModel
@@ -25,12 +25,6 @@ class MPlugForAllTasks(TorchModel):
         self.model = MPlug.from_pretrained(model_dir)
         self.tokenizer = self.model.tokenizer
 
-    def train(self):
-        return self.model.train()
-
-    def eval(self):
-        return self.model.eval()
-
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         """return the result by the model
 
@@ -45,13 +39,43 @@ class MPlugForAllTasks(TorchModel):
                     }
         """
 
-        topk_ids, _ = self.model(**input)
         replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
                                ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
                                ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
 
-        pred_string = self.tokenizer.decode(topk_ids[0][0])
-        for _old, _new in replace_tokens_bert:
-            pred_string = pred_string.replace(_old, _new)
-        pred_string = pred_string.strip()
-        return pred_string
+        def _clean(text: str) -> str:
+            # Drop the BERT special tokens listed above, then trim.
+            for _old, _new in replace_tokens_bert:
+                text = text.replace(_old, _new)
+            return text.strip()
+
+        if not self.training and 'answer_input_ids' not in input:
+            # Plain inference (no reference answer in the batch): return
+            # the first candidate of the first sample as a string.
+            topk_ids, _ = self.model(**input)
+            return _clean(self.tokenizer.decode(topk_ids[0][0]))
+
+        # Training / evaluation: the batch carries tokenized answers.
+        import addict
+        question = addict.Dict(
+            input_ids=input['question_input_ids'],
+            attention_mask=input['question_attention_mask'])
+        answer = addict.Dict(
+            input_ids=input['answer_input_ids'],
+            attention_mask=input['answer_attention_mask'])
+        output = self.model(
+            input['image'], question, answer, train=self.training)
+        if self.training:
+            return {'loss': output}
+        topk_ids, _ = output
+        # Keep the first candidate of every sample in the batch.
+        preds: List[str] = [
+            _clean(self.tokenizer.decode(batch[0])) for batch in topk_ids
+        ]
+        # Targets get the same clean-up as the predictions (the final
+        # strip() used to be applied to `preds` a second time instead).
+        tgts: List[str] = [
+            _clean(self.tokenizer.decode(batch))
+            for batch in input['answer_input_ids'].cpu().numpy().tolist()
+        ]
+        return {'preds': preds, 'tgts': tgts}
diff --git a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py
index 7cff9ad4..fe1402e8 100644
--- a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py
+++ b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py
@@ -60,5 +60,6 @@ class GPT3ForTextGeneration(TorchModel):
         sample_output = self.model.generate(**gen_params)
         return {
             OutputKeys.TEXT:
-            self.tokenizer.decode(sample_output[0], skip_special_tokens=True)
+            self.tokenizer.decode(sample_output[0],
+                                  skip_special_tokens=True).replace(' ', '')
         }
diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
index e432cc58..98aa56c7 100644
--- a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
+++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
@@ -29,20 +29,19 @@ class PalmForTextGeneration(TorchModel):
         self.generator = Translator(self.model)
 
     def _evaluate_postprocess(self, ids_list: List[List[int]]) -> List[str]:
-        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
-                               ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
-                               ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
+        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), ('[unused1]',
+                                                                  ''),
+                               (r' +', ' '), ('[SEP]', ''), ('[unused2]', ''),
+                               ('[CLS]', ''), ('[UNK]', ''), (' ', ''))
         replace_tokens_roberta = ((r' +', ' '), ('<mask>', '. '),
                                   ('<pad>', ''), ('<s>', ''), ('</s>', ''),
                                   ('<unk>', ' '), ('<q>', '. '))
 
+        replace_tokens = replace_tokens_roberta \
+            if self.model.config.encoder == 'roberta' else replace_tokens_bert
         strings = [self.tokenizer.decode(pred_ids) for pred_ids in ids_list]
-        for _old, _new in replace_tokens_bert:
+        for _old, _new in replace_tokens:
             strings = [s.replace(_old, _new) for s in strings]
-        for _old, _new in replace_tokens_roberta:
-            strings = [s.replace(_old, _new) for s in strings]
-        for s in strings:
-            s.strip()
         return strings
 
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 56b10c3a..4f0cb977 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -9,7 +9,7 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Preprocessors
 from modelscope.pipelines.base import Input
 from modelscope.utils.config import Config
-from modelscope.utils.constant import Fields, ModelFile, Tasks
+from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks
 from .base import Preprocessor
 from .builder import PREPROCESSORS
 from .ofa import *  # noqa
@@ -91,9 +91,16 @@ class OfaPreprocessor(Preprocessor):
     Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor)
 class MPlugPreprocessor(Preprocessor):
 
-    def __init__(self, model_dir: str, *args, **kwargs):
+    def __init__(self,
+                 model_dir: str,
+                 mode: str = ModeKeys.INFERENCE,
+                 tokenizer_max_length: int = 25,
+                 *args,
+                 **kwargs):
         super().__init__(*args, **kwargs)
         self.model_dir = model_dir
+        self.mode = mode
+        self.tokenizer_max_length = tokenizer_max_length
 
         self._tokenizer = None
         self._patch_resize_transform = None
@@ -128,40 +135,51 @@ class MPlugPreprocessor(Preprocessor):
 
     def __call__(self, *args, **kwargs):
         call_mapping = {
-            Tasks.visual_question_answering: self.vqa_call,
-            Tasks.image_captioning: self.caption_call
+            Tasks.visual_question_answering: self.image_text_call,
+            Tasks.image_captioning: self.image_text_call,
         }
 
         self.cfg = Config.from_file(
             osp.join(self.model_dir, ModelFile.CONFIGURATION))
         return call_mapping[self.cfg.task](*args, **kwargs)
 
-    def vqa_call(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]:
-        image: Image.Image = data[0] if isinstance(data,
-                                                   tuple) else data['image']
-        question: str = data[1] if isinstance(data,
-                                              tuple) else data['question']
-        image = image.convert('RGB')
-        image = self.patch_resize_transform(image)
-        image = torch.stack([image], dim=0)
-        question = self.tokenizer([question.lower()],
-                                  padding='longest',
-                                  return_tensors='pt')
-
-        return {'image': image, 'question': question, 'train': False}
-
-    def caption_call(
+    def image_text_call(
             self, data: Union[Image.Image, tuple,
                               Dict[str, Any]]) -> Dict[str, Any]:
-        if isinstance(data, Image.Image):
+        if isinstance(data, (Image.Image, str)):
             image = data
         elif isinstance(data, tuple):
             image = data[0]
         else:
             image = data['image']
+        if isinstance(image, str):
+            image = Image.open(image)
+        question = '' if self.cfg.task != Tasks.visual_question_answering \
+            else data[1 if isinstance(data, tuple) else 'question']
         image = image.convert('RGB')
         image = self.patch_resize_transform(image)
-        image = torch.stack([image], dim=0)
-        question = self.tokenizer('', return_tensors='pt')
+        question = self.tokenizer(
+            question.lower(),
+            padding='max_length',
+            truncation=True,
+            max_length=self.tokenizer_max_length,
+            return_tensors='pt')
 
-        return {'image': image, 'question': question, 'train': False}
+        if self.mode == ModeKeys.INFERENCE:
+            image = torch.stack([image], dim=0)
+            return {'image': image, 'question': question, 'train': False}
+        else:
+            answer = data['answer']
+            answer = self.tokenizer(
+                answer,
+                padding='max_length',
+                truncation=True,
+                max_length=self.tokenizer_max_length,
+                return_tensors='pt')
+            return {
+                'image': image,
+                'question_input_ids': question.input_ids.squeeze(),
+                'question_attention_mask': question.attention_mask.squeeze(),
+                'answer_input_ids': answer.input_ids.squeeze(),
+                'answer_attention_mask': answer.attention_mask.squeeze(),
+            }
diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
new file mode 100644
index 00000000..5776141c
--- /dev/null
+++ b/tests/trainers/test_finetune_mplug.py
@@ -0,0 +1,128 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+
+from PIL import Image
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.models.multi_modal import MPlugForAllTasks
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import EpochBasedTrainer, build_trainer
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class TestFinetuneMPlug(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+        datadict = MsDataset.load('coco_captions_small_slice')
+        self.train_dataset = MsDataset(datadict['train'].to_hf_dataset().map(
+            lambda _: {
+                'question': 'what the picture describes?'
+            }).rename_column('image:FILE',
+                             'image').rename_column('answer:Value', 'answer'))
+        self.test_dataset = MsDataset(datadict['test'].to_hf_dataset().map(
+            lambda _: {
+                'question': 'what the picture describes?'
+            }).rename_column('image:FILE',
+                             'image').rename_column('answer:Value', 'answer'))
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_caption(self):
+
+        kwargs = dict(
+            model='damo/mplug_image-captioning_coco_base_en',
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            work_dir=self.tmp_dir)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(3):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_caption_with_model_and_args(self):
+        tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(tmp_dir):
+            os.makedirs(tmp_dir)
+
+        cache_path = snapshot_download(
+            'damo/mplug_image-captioning_coco_base_en')
+        model = MPlugForAllTasks.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=2,
+            work_dir=self.tmp_dir)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(2):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_vqa(self):
+
+        kwargs = dict(
+            model='damo/mplug_visual-question-answering_coco_large_en',
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            work_dir=self.tmp_dir)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(3):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_vqa_with_model_and_args(self):
+        tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(tmp_dir):
+            os.makedirs(tmp_dir)
+
+        cache_path = snapshot_download(
+            'damo/mplug_visual-question-answering_coco_large_en')
+        model = MPlugForAllTasks.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=2,
+            work_dir=self.tmp_dir)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(2):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 8ac65d55866cc776942fae26562a4e3ad72ecefd Mon Sep 17 00:00:00 2001
From: "james.wjg" <james.wjg@alibaba-inc.com>
Date: Thu, 25 Aug 2022 22:00:03 +0800
Subject: [PATCH 008/175] =?UTF-8?q?[to=20#42322933]=20video=5Fsummarizatio?=
 =?UTF-8?q?n=20=E4=BF=AE=E6=94=B9test=E4=B8=AD=E7=9A=84=E7=BB=93=E6=9E=9C?=
 =?UTF-8?q?=E5=8F=AF=E8=A7=86=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

video_summarization 修改test中的结果可视化
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9902499
---
 .../cv/video_summarization_pipeline.py        |  2 +-
 modelscope/utils/cv/image_utils.py            | 21 +++++++++++++++++++
 tests/pipelines/test_video_summarization.py   | 20 ++++++++++--------
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/modelscope/pipelines/cv/video_summarization_pipeline.py b/modelscope/pipelines/cv/video_summarization_pipeline.py
index 9ed9c867..001780e1 100644
--- a/modelscope/pipelines/cv/video_summarization_pipeline.py
+++ b/modelscope/pipelines/cv/video_summarization_pipeline.py
@@ -106,4 +106,4 @@ class VideoSummarizationPipeline(Pipeline):
             summary = generate_summary([change_points], [scores], [n_frames],
                                        [picks])[0]
 
-        return summary
+        return summary.tolist()
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index 9ded7ef3..0ad0ef8f 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -166,3 +166,24 @@ def semantic_seg_masks_to_image(masks):
         mask = mask.astype(bool)
         draw_img[mask] = color_mask
     return draw_img
+
+
+def show_video_summarization_result(video_in_path, result, video_save_path):
+    frame_indexes = result[OutputKeys.OUTPUT]  # one flag per frame; 1 = keep
+    cap = cv2.VideoCapture(video_in_path)
+    video_writer = None  # opened lazily once the first frame gives the size
+    for idx in frame_indexes:
+        success, frame = cap.read()
+        if not success:
+            raise Exception(
+                f'{video_in_path} can not be correctly decoded by OpenCV.')
+        if video_writer is None:
+            size = (frame.shape[1], frame.shape[0])
+            fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
+            video_writer = cv2.VideoWriter(
+                video_save_path, fourcc, cap.get(cv2.CAP_PROP_FPS), size, True)
+        if idx == 1:
+            video_writer.write(frame)
+    if video_writer is not None:  # stays None when there were no frames
+        video_writer.release()
+    cap.release()
diff --git a/tests/pipelines/test_video_summarization.py b/tests/pipelines/test_video_summarization.py
index 36724332..12a0ee07 100644
--- a/tests/pipelines/test_video_summarization.py
+++ b/tests/pipelines/test_video_summarization.py
@@ -3,6 +3,7 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import show_video_summarization_result
 from modelscope.utils.test_utils import test_level
 
 
@@ -10,22 +11,23 @@ class VideoSummarizationTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-
+        model_id = 'damo/cv_googlenet_pgl-video-summarization'
+        video_path = 'data/test/videos/video_category_test_video.mp4'
         summarization_pipeline = pipeline(
-            Tasks.video_summarization,
-            model='damo/cv_googlenet_pgl-video-summarization')
-        result = summarization_pipeline(
-            'data/test/videos/video_category_test_video.mp4')
+            Tasks.video_summarization, model=model_id)
+        result = summarization_pipeline(video_path)
 
-        print(f'video summarization output: {result}.')
+        print(f'video summarization output: \n{result}.')
+        show_video_summarization_result(video_path, result,
+                                        './summarization_result.avi')
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
+        video_path = 'data/test/videos/video_category_test_video.mp4'
         summarization_pipeline = pipeline(Tasks.video_summarization)
-        result = summarization_pipeline(
-            'data/test/videos/video_category_test_video.mp4')
+        result = summarization_pipeline(video_path)
 
-        print(f'video summarization output: {result}.')
+        print(f'video summarization output:\n {result}.')
 
 
 if __name__ == '__main__':

From 44033290d4788a2a1a14d75410ec44f19fe243d2 Mon Sep 17 00:00:00 2001
From: "xingjun.wxj" <xingjun.wxj@alibaba-inc.com>
Date: Thu, 25 Aug 2022 22:28:10 +0800
Subject: [PATCH 009/175] =?UTF-8?q?[to=20#42322933]MsDataset=20=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E4=B8=8A=E4=BC=A0=E6=95=B0=E6=8D=AE=E9=9B=86=E5=8E=8B?=
 =?UTF-8?q?=E7=BC=A9=E5=8C=85=E5=92=8Cmeta?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. MsDataset支持upload数据文件(压缩包)
2. MsDataset支持clone和upload meta data
3. 使用MsDataset.load()下载数据集，支持web端显示数据集下载计数
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9831232
---
 .dev_scripts/dockerci.sh                      |   1 +
 modelscope/hub/api.py                         |  34 ++++-
 modelscope/hub/repository.py                  | 120 ++++++++++++++++--
 modelscope/hub/utils/utils.py                 |   8 +-
 modelscope/msdatasets/ms_dataset.py           | 117 ++++++++++++++++-
 modelscope/msdatasets/utils/oss_utils.py      |  33 ++++-
 modelscope/msdatasets/utils/upload_utils.py   |  23 ++++
 .../config.py => utils/config_ds.py}          |   0
 modelscope/utils/constant.py                  |   1 +
 tests/msdatasets/test_dataset_upload.py       |  95 ++++++++++++++
 tests/msdatasets/test_ms_dataset.py           |   4 +-
 11 files changed, 407 insertions(+), 29 deletions(-)
 create mode 100644 modelscope/msdatasets/utils/upload_utils.py
 rename modelscope/{msdatasets/config.py => utils/config_ds.py} (100%)
 create mode 100644 tests/msdatasets/test_dataset_upload.py

diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh
index 383eb909..95dd0e1a 100644
--- a/.dev_scripts/dockerci.sh
+++ b/.dev_scripts/dockerci.sh
@@ -32,6 +32,7 @@ do
              -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \
              -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
              -e TEST_LEVEL=$TEST_LEVEL \
+             -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
              --workdir=$CODE_DIR_IN_CONTAINER \
              --net host  \
              ${IMAGE_NAME}:${IMAGE_VERSION} \
diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 09bff2c1..721f5637 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -1,7 +1,6 @@
 import os
 import pickle
 import shutil
-import subprocess
 from collections import defaultdict
 from http import HTTPStatus
 from http.cookiejar import CookieJar
@@ -16,8 +15,7 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
                                       API_RESPONSE_FIELD_MESSAGE,
                                       API_RESPONSE_FIELD_USERNAME,
                                       DEFAULT_CREDENTIALS_PATH)
-from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
-                                          HUB_DATASET_ENDPOINT)
+from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH
 from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        DEFAULT_MODEL_REVISION,
                                        DatasetFormations, DatasetMetaFormats,
@@ -26,7 +24,8 @@ from modelscope.utils.logger import get_logger
 from .errors import (InvalidParameter, NotExistError, RequestError,
                      datahub_raise_on_error, handle_http_response, is_ok,
                      raise_on_error)
-from .utils.utils import get_endpoint, model_id_to_group_owner_name
+from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
+                          model_id_to_group_owner_name)
 
 logger = get_logger()
 
@@ -35,7 +34,8 @@ class HubApi:
 
     def __init__(self, endpoint=None, dataset_endpoint=None):
         self.endpoint = endpoint if endpoint is not None else get_endpoint()
-        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT
+        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
+        )
 
     def login(
         self,
@@ -376,6 +376,27 @@ class HubApi:
                       f'ststoken?Revision={revision}'
         return self.datahub_remote_call(datahub_url)
 
+    def get_dataset_access_config_session(
+            self,
+            cookies: CookieJar,
+            dataset_name: str,
+            namespace: str,
+            revision: Optional[str] = DEFAULT_DATASET_REVISION):
+
+        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
+                      f'ststoken?Revision={revision}'
+
+        cookies = requests.utils.dict_from_cookiejar(cookies)
+        r = requests.get(url=datahub_url, cookies=cookies)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        return resp['Data']
+
+    def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
+        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
+        r = requests.post(url)
+        r.raise_for_status()
+
     @staticmethod
     def datahub_remote_call(url):
         r = requests.get(url)
@@ -383,6 +404,9 @@ class HubApi:
         datahub_raise_on_error(url, resp)
         return resp['Data']
 
+    def check_cookies_upload_data(self, use_cookies) -> CookieJar:
+        return self._check_cookie(use_cookies=use_cookies)
+
 
 class ModelScopeConfig:
     path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py
index 51ddf954..6f560f7a 100644
--- a/modelscope/hub/repository.py
+++ b/modelscope/hub/repository.py
@@ -2,7 +2,8 @@ import os
 from typing import Optional
 
 from modelscope.hub.errors import GitError, InvalidParameter, NotLoginException
-from modelscope.utils.constant import DEFAULT_MODEL_REVISION
+from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
+                                       DEFAULT_MODEL_REVISION)
 from modelscope.utils.logger import get_logger
 from .api import ModelScopeConfig
 from .git import GitCommandWrapper
@@ -15,14 +16,12 @@ class Repository:
     """A local representation of the model git repository.
     """
 
-    def __init__(
-        self,
-        model_dir: str,
-        clone_from: str,
-        revision: Optional[str] = DEFAULT_MODEL_REVISION,
-        auth_token: Optional[str] = None,
-        git_path: Optional[str] = None,
-    ):
+    def __init__(self,
+                 model_dir: str,
+                 clone_from: str,
+                 revision: Optional[str] = DEFAULT_MODEL_REVISION,
+                 auth_token: Optional[str] = None,
+                 git_path: Optional[str] = None):
         """
         Instantiate a Repository object by cloning the remote ModelScopeHub repo
         Args:
@@ -86,6 +85,7 @@ class Repository:
              branch: Optional[str] = DEFAULT_MODEL_REVISION,
              force: bool = False):
         """Push local files to remote, this method will do.
+           git pull
            git add
            git commit
            git push
@@ -117,3 +117,105 @@ class Repository:
             url=url,
             local_branch=branch,
             remote_branch=branch)
+
+
+class DatasetRepository:
+    """A local representation of the dataset (metadata) git repository.
+    """
+
+    def __init__(self,
+                 repo_work_dir: str,
+                 dataset_id: str,
+                 revision: Optional[str] = DEFAULT_DATASET_REVISION,
+                 auth_token: Optional[str] = None,
+                 git_path: Optional[str] = None):
+        """
+        Set up a Dataset Repository object for a remote ModelScope dataset repo (call `clone()` to fetch it)
+        Args:
+            repo_work_dir(`str`):
+                The dataset repo root directory.
+            dataset_id:
+                dataset id in ModelScope to git clone from
+            revision(`Optional[str]`):
+                revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash
+            auth_token(`Optional[str]`):
+                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
+                as the token is already saved when you login the first time, if None, we will use saved token.
+            git_path:(`Optional[str]`):
+                The git command line path, if None, we use 'git'
+        """
+        self.dataset_id = dataset_id
+        self.repo_work_dir = repo_work_dir
+        self.repo_base_dir = os.path.dirname(repo_work_dir)
+        self.repo_name = os.path.basename(repo_work_dir)
+        self.revision = revision
+        if auth_token:
+            self.auth_token = auth_token
+        else:
+            self.auth_token = ModelScopeConfig.get_token()
+
+        self.git_wrapper = GitCommandWrapper(git_path)
+        os.makedirs(self.repo_work_dir, exist_ok=True)
+        self.repo_url = self._get_repo_url(dataset_id=dataset_id)
+
+    def clone(self) -> str:
+        # Work dir is not empty: check whether it already tracks repo_url.
+        if os.listdir(self.repo_work_dir):
+            remote_url = self._get_remote_url()
+            remote_url = self.git_wrapper.remove_token_from_url(remote_url)
+            # same remote: no need to clone again, '' tells the caller so
+            if remote_url and remote_url == self.repo_url:
+                return ''
+
+        logger.info('Cloning repo from {} '.format(self.repo_url))
+        self.git_wrapper.clone(self.repo_base_dir, self.auth_token,
+                               self.repo_url, self.repo_name, self.revision)
+        return self.repo_work_dir
+
+    def push(self,
+             commit_message: str,
+             branch: Optional[str] = DEFAULT_DATASET_REVISION,
+             force: bool = False):
+        """Push local files to remote. This method runs, in order:
+           git pull
+           git add
+           git commit
+           git push
+        Args:
+            commit_message (str): commit message
+            branch (Optional[str], optional): which branch to push.
+            force (bool): only type-checked here; not passed on to the git push call below.
+        """
+        if commit_message is None or not isinstance(commit_message, str):
+            msg = 'commit_message must be provided!'
+            raise InvalidParameter(msg)
+
+        if not isinstance(force, bool):
+            raise InvalidParameter('force must be bool')
+
+        if not self.auth_token:
+            raise NotLoginException('Must login to push, please login first.')
+
+        self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token)
+        self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name)
+
+        remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
+        self.git_wrapper.pull(self.repo_work_dir)
+        self.git_wrapper.add(self.repo_work_dir, all_files=True)
+        self.git_wrapper.commit(self.repo_work_dir, commit_message)
+        self.git_wrapper.push(
+            repo_dir=self.repo_work_dir,
+            token=self.auth_token,
+            url=remote_url,
+            local_branch=branch,
+            remote_branch=branch)
+
+    def _get_repo_url(self, dataset_id):
+        return f'{get_endpoint()}/datasets/{dataset_id}.git'
+
+    def _get_remote_url(self):
+        try:
+            remote = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
+        except GitError:
+            remote = None
+        return remote
diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py
index 1a55c9f9..8faf8f1d 100644
--- a/modelscope/hub/utils/utils.py
+++ b/modelscope/hub/utils/utils.py
@@ -1,7 +1,8 @@
 import hashlib
 import os
 
-from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
+from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
+                                      DEFAULT_MODELSCOPE_DOMAIN,
                                       DEFAULT_MODELSCOPE_GROUP,
                                       MODEL_ID_SEPARATOR,
                                       MODELSCOPE_URL_SCHEME)
@@ -38,6 +39,11 @@ def get_endpoint():
     return MODELSCOPE_URL_SCHEME + modelscope_domain
 
 
+def get_dataset_hub_endpoint():
+    return os.environ.get('HUB_DATASET_ENDPOINT',
+                          DEFAULT_MODELSCOPE_DATA_ENDPOINT)
+
+
 def compute_hash(file_path):
     BUFFER_SIZE = 1024 * 64  # 64k buffer size
     sha256_hash = hashlib.sha256()
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 6e4486dd..454044a4 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -12,9 +12,11 @@ from datasets.utils.download_manager import DownloadConfig
 from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
 
-from modelscope.msdatasets.config import MS_DATASETS_CACHE
+from modelscope.hub.repository import DatasetRepository
 from modelscope.utils.config import ConfigDict
-from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
+from modelscope.utils.config_ds import MS_DATASETS_CACHE
+from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
+                                       DEFAULT_DATASET_REVISION,
                                        DatasetFormations, DownloadMode, Hubs)
 from modelscope.utils.logger import get_logger
 from .task_datasets.builder import build_task_dataset
@@ -23,6 +25,7 @@ from .utils.dataset_utils import (get_dataset_files,
                                   get_target_dataset_structure,
                                   load_dataset_builder)
 from .utils.download_utils import DatasetDownloadManager
+from .utils.upload_utils import DatasetUploadManager
 
 logger = get_logger()
 
@@ -97,7 +100,7 @@ class MsDataset:
     @staticmethod
     def load(
         dataset_name: Union[str, list],
-        namespace: Optional[str] = 'modelscope',
+        namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
         target: Optional[str] = None,
         version: Optional[str] = DEFAULT_DATASET_REVISION,
         hub: Optional[Hubs] = Hubs.modelscope,
@@ -171,15 +174,17 @@ class MsDataset:
                              Mapping[str, Union[str, Sequence[str]]]]] = None,
                          download_mode: Optional[DownloadMode] = None,
                          **config_kwargs) -> Union[dict, 'MsDataset']:
+        from modelscope.hub.api import HubApi
+        api = HubApi()
+        download_dataset = ''
         if isinstance(dataset_name, str):
+            download_dataset = dataset_name
             dataset_formation = DatasetFormations.native
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
                     (os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
                 dataset_formation = DatasetFormations.hf_compatible
             elif is_relative_path(dataset_name) and dataset_name.count(
                     '/') == 0:
-                from modelscope.hub.api import HubApi
-                api = HubApi()
                 dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts(
                     dataset_name, namespace, download_mode, version)
                 # dataset organized to be compatible with hf format
@@ -219,6 +224,11 @@ class MsDataset:
         else:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
+
+        if download_dataset:
+            api.on_dataset_download(
+                dataset_name=download_dataset, namespace=namespace)
+
         return MsDataset.from_hf_dataset(dataset, target=target)
 
     @staticmethod
@@ -539,3 +549,100 @@ class MsDataset:
     def to_hf_dataset(self) -> Dataset:
         self._hf_ds.reset_format()
         return self._hf_ds
+
+    @staticmethod
+    def upload(object_name: str,
+               local_file_path: str,
+               dataset_name: str,
+               namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
+               version: Optional[str] = DEFAULT_DATASET_REVISION) -> None:
+        """Upload dataset file to the ModelScope Hub. Please login to the ModelScope Hub first.
+
+        Args:
+            object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip
+            local_file_path (str): Local file to upload
+            dataset_name (str): Name of the dataset
+            namespace(str, optional): Namespace of the dataset
+            version: Optional[str]: Version of the dataset
+
+        Returns:
+            None
+
+        """
+        from modelscope.hub.api import HubApi
+        _hub_api = HubApi()
+        cookies = _hub_api.check_cookies_upload_data(use_cookies=True)
+        _upload_manager = DatasetUploadManager(
+            dataset_name=dataset_name,
+            namespace=namespace,
+            version=version,
+            cookies=cookies)
+        _upload_manager.upload(object_name, local_file_path)
+
+    @staticmethod
+    def clone_meta(dataset_work_dir: str,
+                   dataset_id: str,
+                   revision: Optional[str] = DEFAULT_DATASET_REVISION,
+                   auth_token: Optional[str] = None,
+                   git_path: Optional[str] = None) -> None:
+        """Clone meta-file of dataset from the ModelScope Hub.
+        Args:
+            dataset_work_dir (str): Current git working directory.
+            dataset_id (str): Dataset id; it should be like your-namespace/your-dataset-name .
+            revision(`Optional[str]`):
+                revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash
+            auth_token(`Optional[str]`):
+                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
+                as the token is already saved when you login the first time, if None, we will use saved token.
+            git_path:(`Optional[str]`):
+                The git command line path, if None, we use 'git'
+        Returns:
+            None
+        """
+
+        _repo = DatasetRepository(
+            repo_work_dir=dataset_work_dir,
+            dataset_id=dataset_id,
+            revision=revision,
+            auth_token=auth_token,
+            git_path=git_path)
+        clone_work_dir = _repo.clone()
+        if clone_work_dir:
+            logger.info('Already cloned repo to: {}'.format(clone_work_dir))
+        else:
+            logger.warning('The repo working dir already exists.')
+
+    @staticmethod
+    def upload_meta(dataset_work_dir: str,
+                    dataset_id: str,
+                    commit_message: str,
+                    revision: Optional[str] = DEFAULT_DATASET_REVISION,
+                    auth_token: Optional[str] = None,
+                    git_path: Optional[str] = None,
+                    force: bool = False) -> None:
+        """Upload meta-file of dataset to the ModelScope Hub. Please clone the meta-data from the ModelScope Hub first.
+
+        Args:
+            dataset_work_dir (str): Current working directory.
+            dataset_id (str): Dataset id; it should be like your-namespace/your-dataset-name .
+            commit_message (str): Commit message.
+            revision(`Optional[str]`):
+                revision of the dataset repo; it is also passed as the branch to push to.
+            auth_token(`Optional[str]`):
+                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
+                as the token is already saved when you login the first time, if None, we will use saved token.
+            git_path:(`Optional[str]`):
+                The git command line path, if None, we use 'git'
+            force (bool): forced-push flag, passed through to `DatasetRepository.push`.
+
+        Returns:
+            None
+
+        """
+        _repo = DatasetRepository(
+            repo_work_dir=dataset_work_dir,
+            dataset_id=dataset_id,
+            revision=revision,
+            auth_token=auth_token,
+            git_path=git_path)
+        _repo.push(commit_message=commit_message, branch=revision, force=force)
diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py
index 83cfc7dd..033c8b96 100644
--- a/modelscope/msdatasets/utils/oss_utils.py
+++ b/modelscope/msdatasets/utils/oss_utils.py
@@ -1,6 +1,5 @@
 from __future__ import print_function
 import os
-import sys
 
 import oss2
 from datasets.utils.file_utils import hash_url_to_filename
@@ -19,6 +18,12 @@ class OssUtilities:
         self.oss_dir = oss_config['Dir']
         self.oss_backup_dir = oss_config['BackupDir']
 
+    @staticmethod
+    def _percentage(consumed_bytes, total_bytes):
+        if total_bytes:
+            rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
+            print('\r{0}% '.format(rate), end='', flush=True)
+
     def download(self, oss_file_name, cache_dir):
         candidate_key = os.path.join(self.oss_dir, oss_file_name)
         candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
@@ -27,11 +32,25 @@ class OssUtilities:
         filename = hash_url_to_filename(file_oss_key, etag=None)
         local_path = os.path.join(cache_dir, filename)
 
-        def percentage(consumed_bytes, total_bytes):
-            if total_bytes:
-                rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
-                print('\r{0}% '.format(rate), end='', flush=True)
-
         self.bucket.get_object_to_file(
-            file_oss_key, local_path, progress_callback=percentage)
+            file_oss_key, local_path, progress_callback=self._percentage)
         return local_path
+
+    def upload(self, oss_file_name: str, local_file_path: str) -> str:
+        max_retries = 3
+        retry_count = 0
+        object_key = os.path.join(self.oss_dir, oss_file_name)
+
+        while True:
+            try:
+                retry_count += 1
+                self.bucket.put_object_from_file(
+                    object_key,
+                    local_file_path,
+                    progress_callback=self._percentage)
+                break
+            except Exception:
+                if retry_count >= max_retries:
+                    raise
+
+        return object_key
diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py
new file mode 100644
index 00000000..eff3aca0
--- /dev/null
+++ b/modelscope/msdatasets/utils/upload_utils.py
@@ -0,0 +1,23 @@
+from http.cookiejar import CookieJar
+
+from .oss_utils import OssUtilities
+
+
+class DatasetUploadManager(object):
+    """Uploads files through an OssUtilities built from a hub dataset's access config."""
+
+    def __init__(self, dataset_name: str, namespace: str, version: str,
+                 cookies: CookieJar):
+        from modelscope.hub.api import HubApi
+        oss_config = HubApi().get_dataset_access_config_session(
+            cookies=cookies,
+            dataset_name=dataset_name,
+            namespace=namespace,
+            revision=version)
+        # Every transfer goes through the OSS helper built from that config.
+        self.oss_utilities = OssUtilities(oss_config)
+
+    def upload(self, oss_file_name: str, local_file_path: str) -> str:
+        """Upload `local_file_path` as `oss_file_name`; return the resulting OSS object key."""
+        return self.oss_utilities.upload(
+            oss_file_name=oss_file_name, local_file_path=local_file_path)
diff --git a/modelscope/msdatasets/config.py b/modelscope/utils/config_ds.py
similarity index 100%
rename from modelscope/msdatasets/config.py
rename to modelscope/utils/config_ds.py
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 81712983..4ef34812 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -254,6 +254,7 @@ class Frameworks(object):
 
 DEFAULT_MODEL_REVISION = 'master'
 DEFAULT_DATASET_REVISION = 'master'
+DEFAULT_DATASET_NAMESPACE = 'modelscope'
 
 
 class ModeKeys:
diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py
new file mode 100644
index 00000000..61b1c6a4
--- /dev/null
+++ b/tests/msdatasets/test_dataset_upload.py
@@ -0,0 +1,95 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+import zipfile
+
+from modelscope.msdatasets import MsDataset
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.test_utils import test_level
+
+KEY_EXTRACTED = 'extracted'
+
+
+class DatasetUploadTest(unittest.TestCase):
+    """Exercises MsDataset.upload / clone_meta / upload_meta; needs env TEST_UPLOAD_MS_TOKEN."""
+    def setUp(self):
+        self.old_dir = os.getcwd()
+        self.dataset_name = 'small_coco_for_test'
+        self.dataset_file_name = self.dataset_name
+        self.prepared_dataset_name = 'pets_small'
+        self.token = os.getenv('TEST_UPLOAD_MS_TOKEN')
+        error_msg = 'The modelscope token can not be empty, please set env variable: TEST_UPLOAD_MS_TOKEN'
+        self.assertIsNotNone(self.token, msg=error_msg)
+        from modelscope.hub.api import HubApi
+        from modelscope.hub.api import ModelScopeConfig
+        self.api = HubApi()
+        self.api.login(self.token)
+
+        # The first element of the logged-in user's info is used as the namespace.
+        self.namespace, _ = ModelScopeConfig.get_user_info()
+
+        self.temp_dir = tempfile.mkdtemp()
+        self.test_work_dir = os.path.join(self.temp_dir, self.dataset_name)
+        self.test_meta_dir = os.path.join(self.test_work_dir, 'meta')
+        if not os.path.exists(self.test_work_dir):
+            os.makedirs(self.test_work_dir)
+
+    def tearDown(self):
+        os.chdir(self.old_dir)
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        print('The test dir successfully removed!')
+
+    @staticmethod
+    def get_raw_downloaded_file_path(extracted_path):
+        raw_downloaded_file_path = ''  # returned unchanged when no zip file is found
+        raw_data_dir = os.path.abspath(
+            os.path.join(extracted_path, '../../..'))
+        for root, dirs, files in os.walk(raw_data_dir):
+            if KEY_EXTRACTED in dirs:  # only inspect directories that hold an 'extracted' subdir
+                for file in files:
+                    curr_file_path = os.path.join(root, file)
+                    if zipfile.is_zipfile(curr_file_path):
+                        raw_downloaded_file_path = curr_file_path
+        return raw_downloaded_file_path
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_upload(self):
+        # Load the prepared dataset without passing a namespace, then re-upload its raw zip.
+        ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
+        config_res = ms_ds_train._hf_ds.config_kwargs
+        extracted_path = config_res.get('split_config').get('train')
+        raw_zipfile_path = self.get_raw_downloaded_file_path(extracted_path)
+
+        MsDataset.upload(
+            object_name=self.dataset_file_name + '.zip',
+            local_file_path=raw_zipfile_path,
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_clone_meta(self):
+        MsDataset.clone_meta(
+            dataset_work_dir=self.test_meta_dir,
+            dataset_id=os.path.join(self.namespace, self.dataset_name))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_upload_meta(self):
+        # Clone the dataset meta repo first so there is a working copy to modify.
+        MsDataset.clone_meta(
+            dataset_work_dir=self.test_meta_dir,
+            dataset_id=os.path.join(self.namespace, self.dataset_name))
+
+        with open(os.path.join(self.test_meta_dir, ModelFile.README),
+                  'a') as f:
+            f.write('\nThis is a line for unit test.')
+
+        MsDataset.upload_meta(
+            dataset_work_dir=self.test_meta_dir,
+            dataset_id=os.path.join(self.namespace, self.dataset_name),
+            commit_message='Update for unit test.')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index f9118353..0d8c8a4d 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -4,7 +4,7 @@ from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
-from modelscope.utils.constant import DownloadMode
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode
 from modelscope.utils.test_utils import require_tf, require_torch, test_level
 
 
@@ -35,7 +35,7 @@ class MsDatasetTest(unittest.TestCase):
     def test_coco(self):
         ms_ds_train = MsDataset.load(
             'pets_small',
-            namespace='modelscope',
+            namespace=DEFAULT_DATASET_NAMESPACE,
             split='train',
             download_mode=DownloadMode.FORCE_REDOWNLOAD,
             classes=('1', '2'))

From 83b0adf0a2391a8459b28685d843970fcdbcb310 Mon Sep 17 00:00:00 2001
From: pangda <pangda@alibaba-inc.com>
Date: Thu, 25 Aug 2022 23:04:14 +0800
Subject: [PATCH 010/175] [to #42322933] fix bug for multi-lang text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

支持多语言tokenize（830模型）
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9900916
---
 modelscope/preprocessors/nlp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 25576667..222a219a 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -533,7 +533,7 @@ class NERPreprocessor(Preprocessor):
         self.model_dir: str = model_dir
         self.sequence_length = kwargs.pop('sequence_length', 512)
         self.tokenizer = AutoTokenizer.from_pretrained(
-            model_dir, use_fast=False)
+            model_dir, use_fast=True)
         self.is_split_into_words = self.tokenizer.init_kwargs.get(
             'is_split_into_words', False)
 

From 52f581d7d52452af89bab9761080b1195f85af4d Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Fri, 26 Aug 2022 11:51:42 +0800
Subject: [PATCH 011/175] [to #43115513] bump version to 0.3.7

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index 40ed83d9..d93912ee 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.3.5'
+__version__ = '0.3.7'

From dc45fce542abab709bb0559248ba2712224a9df6 Mon Sep 17 00:00:00 2001
From: "tanfan.zjh" <tanfan.zjh@alibaba-inc.com>
Date: Fri, 26 Aug 2022 13:06:41 +0800
Subject: [PATCH 012/175] =?UTF-8?q?[to=20#42322933]=E6=96=B0=E5=A2=9EFAQ?=
 =?UTF-8?q?=E9=97=AE=E7=AD=94=E6=A8=A1=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Maas新增FAQ问答模型
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9797053
---
 modelscope/hub/errors.py                      |   4 +-
 modelscope/metainfo.py                        |   2 +
 .../skin_retouching/retinaface/box_utils.py   |   3 +-
 modelscope/models/nlp/__init__.py             |   2 +
 .../nlp/sbert_for_faq_question_answering.py   | 249 ++++++++++++++++++
 modelscope/outputs.py                         |  11 +
 modelscope/pipelines/builder.py               |   4 +-
 modelscope/pipelines/nlp/__init__.py          |   4 +-
 .../nlp/faq_question_answering_pipeline.py    |  76 ++++++
 modelscope/preprocessors/__init__.py          |   6 +-
 modelscope/preprocessors/nlp.py               |  87 +++++-
 modelscope/utils/constant.py                  |   1 +
 .../pipelines/test_faq_question_answering.py  |  85 ++++++
 13 files changed, 526 insertions(+), 8 deletions(-)
 create mode 100644 modelscope/models/nlp/sbert_for_faq_question_answering.py
 create mode 100644 modelscope/pipelines/nlp/faq_question_answering_pipeline.py
 create mode 100644 tests/pipelines/test_faq_question_answering.py

diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py
index ecd4e1da..e9c008b0 100644
--- a/modelscope/hub/errors.py
+++ b/modelscope/hub/errors.py
@@ -49,8 +49,8 @@ def handle_http_response(response, logger, cookies, model_id):
     except HTTPError:
         if cookies is None:  # code in [403] and
             logger.error(
-                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be private. \
-                  Please login first.')
+                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
+                private. Please login first.')
         raise
 
 
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 8e21c00b..6ea03610 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -138,6 +138,7 @@ class Pipelines(object):
     dialog_state_tracking = 'dialog-state-tracking'
     zero_shot_classification = 'zero-shot-classification'
     text_error_correction = 'text-error-correction'
+    faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
 
     # audio tasks
@@ -220,6 +221,7 @@ class Preprocessors(object):
     text_error_correction = 'text-error-correction'
     word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor'
     fill_mask = 'fill-mask'
+    faq_question_answering_preprocessor = 'faq-question-answering-preprocessor'
     conversational_text_to_sql = 'conversational-text-to-sql'
 
     # audio preprocessor
diff --git a/modelscope/models/cv/skin_retouching/retinaface/box_utils.py b/modelscope/models/cv/skin_retouching/retinaface/box_utils.py
index 89cf8bf6..a4aeffd1 100644
--- a/modelscope/models/cv/skin_retouching/retinaface/box_utils.py
+++ b/modelscope/models/cv/skin_retouching/retinaface/box_utils.py
@@ -6,7 +6,8 @@ import torch
 
 
 def point_form(boxes: torch.Tensor) -> torch.Tensor:
-    """Convert prior_boxes to (x_min, y_min, x_max, y_max) representation for comparison to point form ground truth data.
+    """Convert prior_boxes to (x_min, y_min, x_max, y_max) representation for comparison to point form \
+       ground truth data.
 
     Args:
         boxes: center-size default boxes from priorbox layers.
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 3fd76f98..13be9096 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -21,6 +21,7 @@ if TYPE_CHECKING:
     from .task_models.task_model import SingleBackboneTaskModelBase
     from .bart_for_text_error_correction import BartForTextErrorCorrection
     from .gpt3 import GPT3ForTextGeneration
+    from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering
 
 else:
     _import_structure = {
@@ -44,6 +45,7 @@ else:
         'task_model': ['SingleBackboneTaskModelBase'],
         'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
         'gpt3': ['GPT3ForTextGeneration'],
+        'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering']
     }
 
     import sys
diff --git a/modelscope/models/nlp/sbert_for_faq_question_answering.py b/modelscope/models/nlp/sbert_for_faq_question_answering.py
new file mode 100644
index 00000000..23ccdcc5
--- /dev/null
+++ b/modelscope/models/nlp/sbert_for_faq_question_answering.py
@@ -0,0 +1,249 @@
+import math
+import os
+from collections import namedtuple
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.structbert import SbertConfig, SbertModel
+from modelscope.models.nlp.task_models.task_model import BaseTaskModel
+from modelscope.utils.config import Config, ConfigFields
+from modelscope.utils.constant import ModelFile, Tasks
+
+__all__ = ['SbertForFaqQuestionAnswering']
+
+
+class SbertForFaqQuestionAnsweringBase(BaseTaskModel):
+    """Shared base for FAQ models: an Sbert backbone plus the pooling and metrics
+    layers selected by the model configuration file."""
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super(SbertForFaqQuestionAnsweringBase,
+              self).__init__(model_dir, *args, **kwargs)
+        # The backbone is instantiated from the config found in model_dir.
+        backbone_cfg = SbertConfig.from_pretrained(model_dir)
+        self.bert = SbertModel(backbone_cfg)
+        # Falls back to an empty dict when the ConfigFields.model section is absent.
+        model_config = Config.from_file(
+            os.path.join(model_dir,
+                         ModelFile.CONFIGURATION)).get(ConfigFields.model, {})
+        # Defaults: cosine metric and average pooling.
+        metric = model_config.get('metric', 'cosine')
+        pooling_method = model_config.get('pooling', 'avg')
+        # Lightweight settings record consumed by MetricsLayer and PoolingLayer.
+        Arg = namedtuple('args', [
+            'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling'
+        ])
+        args = Arg(
+            metrics=metric,
+            proj_hidden_size=self.bert.config.hidden_size,
+            hidden_size=self.bert.config.hidden_size,
+            dropout=0.0,
+            pooling=pooling_method)
+        # Both layers are built from the same settings record.
+        self.metrics_layer = MetricsLayer(args)
+        self.pooling = PoolingLayer(args)
+
+    def _get_onehot_labels(self, labels, support_size, num_cls):
+        labels_ = labels.view(support_size, 1)  # one class index per row
+        target_oh = torch.zeros(support_size, num_cls).to(labels)  # match labels' dtype/device
+        target_oh.scatter_(dim=1, index=labels_, value=1)  # set the label column to 1
+        return target_oh.view(support_size, num_cls).float()
+
+    def forward_sentence_embedding(self, inputs: Dict[str, Tensor]):
+        input_ids = inputs['input_ids']
+        input_mask = inputs['attention_mask']
+        if not isinstance(input_ids, Tensor):  # non-tensor inputs are converted to IntTensor
+            input_ids = torch.IntTensor(input_ids)
+        if not isinstance(input_mask, Tensor):
+            input_mask = torch.IntTensor(input_mask)
+        rst = self.bert(input_ids, input_mask)
+        last_hidden_states = rst.last_hidden_state
+        if len(input_mask.shape) == 2:  # add a trailing axis before the mask is used in pooling
+            input_mask = input_mask.unsqueeze(-1)
+        pooled_representation = self.pooling(last_hidden_states, input_mask)
+        return pooled_representation
+
+
+@MODELS.register_module(
+    Tasks.faq_question_answering, module_name=Models.structbert)
+class SbertForFaqQuestionAnswering(SbertForFaqQuestionAnsweringBase):
+    _backbone_prefix = ''
+    # Inference-only: scores each query against per-class prototypes of the support set.
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        assert not self.training
+        query = input['query']
+        support = input['support']
+        if isinstance(query, list):
+            query = torch.stack(query)
+        if isinstance(support, list):
+            support = torch.stack(support)
+        n_query = query.shape[0]
+        n_support = support.shape[0]
+        query_mask = torch.ne(query, 0).view([n_query, -1])  # mask is True where the id is non-zero
+        support_mask = torch.ne(support, 0).view([n_support, -1])
+        # Class ids are taken to be 0..max(label); one-hot rows select each sample's class.
+        support_labels = input['support_labels']
+        num_cls = torch.max(support_labels) + 1
+        onehot_labels = self._get_onehot_labels(support_labels, n_support,
+                                                num_cls)
+        # Encode queries and supports in a single backbone pass, then split them again.
+        input_ids = torch.cat([query, support])
+        input_mask = torch.cat([query_mask, support_mask], dim=0)
+        pooled_representation = self.forward_sentence_embedding({
+            'input_ids':
+            input_ids,
+            'attention_mask':
+            input_mask
+        })
+        z_query = pooled_representation[:n_query]
+        z_support = pooled_representation[n_query:]
+        cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5  # per-class count; epsilon avoids /0
+        protos = torch.matmul(onehot_labels.transpose(0, 1),
+                              z_support) / cls_n_support.unsqueeze(-1)  # class-mean embeddings
+        scores = self.metrics_layer(z_query, protos).view([n_query, num_cls])
+        if self.metrics_layer.name == 'relation':
+            scores = torch.sigmoid(scores)
+        return {'scores': scores}
+
+
+activations = {
+    'relu': F.relu,
+    'tanh': torch.tanh,
+    'linear': lambda x: x,
+}
+
+activation_coeffs = {
+    'relu': math.sqrt(2),
+    'tanh': 5 / 3,
+    'linear': 1.,
+}
+
+
+class LinearProjection(nn.Module):
+    """Weight-normalised ``nn.Linear`` followed by one of the module-level ``activations``."""
+    def __init__(self,
+                 in_features,
+                 out_features,
+                 activation='linear',
+                 bias=True):
+        super().__init__()
+        self.activation = activations[activation]
+        activation_coeff = activation_coeffs[activation]
+        linear = nn.Linear(in_features, out_features, bias=bias)
+        nn.init.normal_(  # std = sqrt(1 / in_features), scaled by the activation's coefficient
+            linear.weight, std=math.sqrt(1. / in_features) * activation_coeff)
+        if bias:
+            nn.init.zeros_(linear.bias)
+        self.model = nn.utils.weight_norm(linear)
+
+    def forward(self, x):
+        return self.activation(self.model(x))
+
+
+class RelationModule(nn.Module):
+    """Two-layer scorer producing one value for every (query, prototype) pair."""
+    def __init__(self, args):
+        super(RelationModule, self).__init__()
+        input_size = args.proj_hidden_size * 4
+        self.prediction = torch.nn.Sequential(
+            LinearProjection(
+                input_size, args.proj_hidden_size * 4, activation='relu'),
+            nn.Dropout(args.dropout),
+            LinearProjection(args.proj_hidden_size * 4, 1))
+
+    def forward(self, query, protos):
+        n_cls = protos.shape[0]
+        n_query = query.shape[0]
+        protos = protos.unsqueeze(0).repeat(n_query, 1, 1)  # [n_query, n_cls, dim]
+        query = query.unsqueeze(1).repeat(1, n_cls, 1)  # [n_query, n_cls, dim]
+        input_feat = torch.cat(
+            [query, protos, (protos - query).abs(), query * protos], dim=-1)
+        dists = self.prediction(input_feat)  # [n_query, n_cls, 1]
+        return dists.squeeze(-1)
+
+
+class MetricsLayer(nn.Module):
+    """Scores queries against prototypes with cosine similarity or a RelationModule."""
+    def __init__(self, args):
+        super(MetricsLayer, self).__init__()
+        self.args = args
+        assert args.metrics in ('relation', 'cosine')
+        if args.metrics == 'relation':
+            self.relation_net = RelationModule(args)
+
+    @property
+    def name(self):
+        return self.args.metrics
+
+    def forward(self, query, protos):
+        """ query : [n_query, dim]; protos : [n_cls, dim]
+            returns : [n_query, n_cls] scores (cosine ones are scaled by 5 in training mode)
+        """
+        if self.args.metrics == 'cosine':
+            supervised_dists = self.cosine_similarity(query, protos)
+            if self.training:
+                supervised_dists *= 5
+        elif self.args.metrics in ('relation', ):
+            supervised_dists = self.relation_net(query, protos)
+        else:
+            raise NotImplementedError
+        return supervised_dists
+
+    def cosine_similarity(self, x, y):
+        # x=[n_query, dim]
+        # y=[n_cls, dim]
+        n_query = x.shape[0]
+        n_cls = y.shape[0]
+        dim = x.shape[-1]
+        x = x.unsqueeze(1).expand([n_query, n_cls, dim])
+        y = y.unsqueeze(0).expand([n_query, n_cls, dim])
+        return F.cosine_similarity(x, y, -1)
+
+
+class AveragePooling(nn.Module):
+    """Mask-weighted mean of `x` over `dim`; a fully-masked row yields zeros, not NaN."""
+    def forward(self, x, mask, dim=1):
+        weights = mask.float()
+        # Clamp the denominator: an all-zero mask would otherwise divide 0 by 0.
+        return torch.sum(x * weights, dim=dim) / torch.sum(weights, dim=dim).clamp(min=1e-9)
+
+
+class AttnPooling(nn.Module):
+    """Attention pooling: softmax-weighted sum over dim 1 using learned per-position scores."""
+    def __init__(self, input_size, hidden_size=None, output_size=None):
+        super().__init__()
+        self.input_proj = nn.Sequential(
+            LinearProjection(input_size, hidden_size), nn.Tanh(),
+            LinearProjection(hidden_size, 1, bias=False))
+        self.output_proj = LinearProjection(
+            input_size, output_size) if output_size else lambda x: x
+
+    def forward(self, x, mask):
+        score = self.input_proj(x)
+        score = score * mask.float() + -1e4 * (1. - mask.float())  # mask == 0 positions get -1e4
+        score = F.softmax(score, dim=1)
+        features = self.output_proj(x)
+        return torch.matmul(score.transpose(1, 2), features).squeeze(1)
+
+
+class PoolingLayer(nn.Module):
+    """Dispatches to AttnPooling ('attn') or AveragePooling ('avg') based on ``args.pooling``."""
+    def __init__(self, args):
+        super(PoolingLayer, self).__init__()
+        if args.pooling == 'attn':
+            self.pooling = AttnPooling(args.proj_hidden_size,
+                                       args.proj_hidden_size,
+                                       args.proj_hidden_size)
+        elif args.pooling == 'avg':
+            self.pooling = AveragePooling()
+        else:
+            raise NotImplementedError(args.pooling)
+
+    def forward(self, x, mask):
+        return self.pooling(x, mask)
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 640d67fa..2edd76a2 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -7,6 +7,7 @@ class OutputKeys(object):
     LOSS = 'loss'
     LOGITS = 'logits'
     SCORES = 'scores'
+    SCORE = 'score'
     LABEL = 'label'
     LABELS = 'labels'
     INPUT_IDS = 'input_ids'
@@ -504,6 +505,16 @@ TASK_OUTPUTS = {
     # }
     Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS],
 
+    # {
+    #   'output': [
+    #     [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509},
+    #      {'label': '13421097', 'score': 2.2825044965202324e-08}],
+    #     [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
+    #      {'label': '13421097', 'score': 2.75914817393641e-06}],
+    #     [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
+    #      {'label': '13421097', 'score': 2.75914817393641e-06}]]
+    # }
+    Tasks.faq_question_answering: [OutputKeys.OUTPUT],
     # image person reid result for single sample
     #   {
     #       "img_embedding": np.array with shape [1, D],
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 52dfa41b..fa6705a7 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -129,6 +129,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/cv_convnextTiny_ocr-recognition-general_damo'),
     Tasks.skin_retouching: (Pipelines.skin_retouching,
                             'damo/cv_unet_skin-retouching'),
+    Tasks.faq_question_answering:
+    (Pipelines.faq_question_answering,
+     'damo/nlp_structbert_faq-question-answering_chinese-base'),
     Tasks.crowd_counting: (Pipelines.crowd_counting,
                            'damo/cv_hrnet_crowd-counting_dcanet'),
     Tasks.video_single_object_tracking:
@@ -218,7 +221,6 @@ def pipeline(task: str = None,
         f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}'
 
     model = normalize_model_input(model, model_revision)
-
     if pipeline_name is None:
         # get default pipeline for this task
         if isinstance(model, str) \
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 0cdb633c..51803872 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -20,6 +20,7 @@ if TYPE_CHECKING:
     from .summarization_pipeline import SummarizationPipeline
     from .text_classification_pipeline import TextClassificationPipeline
     from .text_error_correction_pipeline import TextErrorCorrectionPipeline
+    from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline
 
 else:
     _import_structure = {
@@ -44,7 +45,8 @@ else:
         'translation_pipeline': ['TranslationPipeline'],
         'summarization_pipeline': ['SummarizationPipeline'],
         'text_classification_pipeline': ['TextClassificationPipeline'],
-        'text_error_correction_pipeline': ['TextErrorCorrectionPipeline']
+        'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'],
+        'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline']
     }
 
     import sys
diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
new file mode 100644
index 00000000..65831a17
--- /dev/null
+++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
@@ -0,0 +1,76 @@
+from typing import Any, Dict, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.nlp import SbertForFaqQuestionAnswering
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor
+from modelscope.utils.constant import Tasks
+
+__all__ = ['FaqQuestionAnsweringPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.faq_question_answering, module_name=Pipelines.faq_question_answering)
+class FaqQuestionAnsweringPipeline(Pipeline):
+    """Pipeline whose output lists, per score row, label/score pairs sorted best-first."""
+    def __init__(self,
+                 model: Union[str, SbertForFaqQuestionAnswering],
+                 preprocessor: FaqQuestionAnsweringPreprocessor = None,
+                 **kwargs):
+        model = model if isinstance(
+            model,
+            SbertForFaqQuestionAnswering) else Model.from_pretrained(model)
+        model.eval()
+        if preprocessor is None:
+            preprocessor = FaqQuestionAnsweringPreprocessor(
+                model.model_dir, **kwargs)
+        self.preprocessor = preprocessor
+        super(FaqQuestionAnsweringPipeline, self).__init__(
+            model=model, preprocessor=preprocessor, **kwargs)
+
+    def _sanitize_parameters(self, **pipeline_parameters):  # the same kwargs dict is returned three times
+        return pipeline_parameters, pipeline_parameters, pipeline_parameters
+
+    def get_sentence_embedding(self, inputs, max_len=None):
+        inputs = self.preprocessor.batch_encode(inputs, max_length=max_len)
+        sentence_vecs = self.model.forward_sentence_embedding(inputs)
+        sentence_vecs = sentence_vecs.detach().tolist()  # plain nested lists, detached from autograd
+        return sentence_vecs
+
+    def forward(self, inputs: Union[list, Dict[str, Any]],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return self.model(inputs)
+
+    def postprocess(self, inputs: Union[list, Dict[str, Any]],
+                    **postprocess_params) -> Dict[str, Any]:
+        scores = inputs['scores']
+        labels = []
+        for item in scores:  # the column index of each score is used as its label id
+            tmplabels = [
+                self.preprocessor.get_label(label_id)
+                for label_id in range(len(item))
+            ]
+            labels.append(tmplabels)
+        # Pair every score with its label, then sort each row by descending score.
+        predictions = []
+        for tmp_scores, tmp_labels in zip(scores.tolist(), labels):
+            prediction = []
+            for score, label in zip(tmp_scores, tmp_labels):
+                prediction.append({
+                    OutputKeys.LABEL: label,
+                    OutputKeys.SCORE: score
+                })
+            predictions.append(
+                list(
+                    sorted(
+                        prediction,
+                        key=lambda d: d[OutputKeys.SCORE],
+                        reverse=True)))
+
+        return {OutputKeys.OUTPUT: predictions}
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 0328b91a..ce9df454 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -21,7 +21,8 @@ if TYPE_CHECKING:
                       SingleSentenceClassificationPreprocessor,
                       PairSentenceClassificationPreprocessor,
                       FillMaskPreprocessor, ZeroShotClassificationPreprocessor,
-                      NERPreprocessor, TextErrorCorrectionPreprocessor)
+                      NERPreprocessor, TextErrorCorrectionPreprocessor,
+                      FaqQuestionAnsweringPreprocessor)
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
                         DialogStateTrackingPreprocessor)
@@ -48,7 +49,8 @@ else:
             'SingleSentenceClassificationPreprocessor',
             'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
             'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
-            'TextErrorCorrectionPreprocessor'
+            'TextErrorCorrectionPreprocessor',
+            'FaqQuestionAnsweringPreprocessor'
         ],
         'space': [
             'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor',
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 222a219a..094cbfe2 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -5,10 +5,12 @@ import uuid
 from typing import Any, Dict, Iterable, Optional, Tuple, Union
 
 import numpy as np
+import torch
 from transformers import AutoTokenizer
 
 from modelscope.metainfo import Models, Preprocessors
 from modelscope.outputs import OutputKeys
+from modelscope.utils.config import ConfigFields
 from modelscope.utils.constant import Fields, InputFields, ModeKeys
 from modelscope.utils.hub import get_model_type, parse_label_mapping
 from modelscope.utils.type_assert import type_assert
@@ -21,7 +23,7 @@ __all__ = [
     'PairSentenceClassificationPreprocessor',
     'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
     'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
-    'TextErrorCorrectionPreprocessor'
+    'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor'
 ]
 
 
@@ -645,3 +647,86 @@ class TextErrorCorrectionPreprocessor(Preprocessor):
         sample = dict()
         sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths}
         return sample
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor)
+class FaqQuestionAnsweringPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        super(FaqQuestionAnsweringPreprocessor, self).__init__(
+            model_dir, pair=False, mode=ModeKeys.INFERENCE, **kwargs)
+        import os
+        from transformers import BertTokenizer
+
+        from modelscope.utils.config import Config
+        from modelscope.utils.constant import ModelFile
+        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
+        preprocessor_config = Config.from_file(
+            os.path.join(model_dir, ModelFile.CONFIGURATION)).get(
+                ConfigFields.preprocessor, {})
+        self.MAX_LEN = preprocessor_config.get('max_seq_length', 50)
+        self.label_dict = None
+
+    def pad(self, samples, max_len):
+        result = []
+        for sample in samples:
+            pad_len = max_len - len(sample[:max_len])
+            result.append(sample[:max_len]
+                          + [self.tokenizer.pad_token_id] * pad_len)
+        return result
+
+    def set_label_dict(self, label_dict):
+        self.label_dict = label_dict
+
+    def get_label(self, label_id):
+        assert self.label_dict is not None and label_id < len(self.label_dict)
+        return self.label_dict[label_id]
+
+    def encode_plus(self, text):
+        return [
+            self.tokenizer.cls_token_id
+        ] + self.tokenizer.convert_tokens_to_ids(
+            self.tokenizer.tokenize(text)) + [self.tokenizer.sep_token_id]
+
+    @type_assert(object, Dict)
+    def __call__(self, data: Dict[str, Any],
+                 **preprocessor_param) -> Dict[str, Any]:
+        TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN)
+        queryset = data['query_set']
+        if not isinstance(queryset, list):
+            queryset = [queryset]
+        supportset = data['support_set']
+        supportset = sorted(supportset, key=lambda d: d['label'])
+
+        queryset_tokenized = [self.encode_plus(text) for text in queryset]
+        supportset_tokenized = [
+            self.encode_plus(item['text']) for item in supportset
+        ]
+
+        max_len = max(
+            [len(seq) for seq in queryset_tokenized + supportset_tokenized])
+        max_len = min(TMP_MAX_LEN, max_len)
+        queryset_padded = self.pad(queryset_tokenized, max_len)
+        supportset_padded = self.pad(supportset_tokenized, max_len)
+
+        supportset_labels_ori = [item['label'] for item in supportset]
+        label_dict = []
+        for label in supportset_labels_ori:
+            if label not in label_dict:
+                label_dict.append(label)
+        self.set_label_dict(label_dict)
+        supportset_labels_ids = [
+            label_dict.index(label) for label in supportset_labels_ori
+        ]
+        return {
+            'query': queryset_padded,
+            'support': supportset_padded,
+            'support_labels': supportset_labels_ids
+        }
+
+    def batch_encode(self, sentence_list: list, max_length=None):
+        if not max_length:
+            max_length = self.MAX_LEN
+        return self.tokenizer.batch_encode_plus(
+            sentence_list, padding=True, max_length=max_length)
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 4ef34812..52c08594 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -95,6 +95,7 @@ class NLPTasks(object):
     zero_shot_classification = 'zero-shot-classification'
     backbone = 'backbone'
     text_error_correction = 'text-error-correction'
+    faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
 
 
diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py
new file mode 100644
index 00000000..3a87643c
--- /dev/null
+++ b/tests/pipelines/test_faq_question_answering.py
@@ -0,0 +1,85 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import numpy as np
+
+from modelscope.hub.api import HubApi
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import SbertForFaqQuestionAnswering
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import FaqQuestionAnsweringPipeline
+from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class FaqQuestionAnsweringTest(unittest.TestCase):
+    model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base'
+    param = {
+        'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'],
+        'support_set': [{
+            'text': '卖品代金券怎么用',
+            'label': '6527856'
+        }, {
+            'text': '怎么使用优惠券',
+            'label': '6527856'
+        }, {
+            'text': '这个可以一起领吗',
+            'label': '1000012000'
+        }, {
+            'text': '付款时送的优惠券哪里领',
+            'label': '1000012000'
+        }, {
+            'text': '购物等级怎么长',
+            'label': '13421097'
+        }, {
+            'text': '购物等级二心',
+            'label': '13421097'
+        }]
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_direct_file_download(self):
+        cache_path = snapshot_download(self.model_id)
+        preprocessor = FaqQuestionAnsweringPreprocessor(cache_path)
+        model = SbertForFaqQuestionAnswering(cache_path)
+        model.load_checkpoint(cache_path)
+        pipeline_ins = FaqQuestionAnsweringPipeline(
+            model, preprocessor=preprocessor)
+        result = pipeline_ins(self.param)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        preprocessor = FaqQuestionAnsweringPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.faq_question_answering,
+            model=model,
+            preprocessor=preprocessor)
+        result = pipeline_ins(self.param)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.faq_question_answering, model=self.model_id)
+        result = pipeline_ins(self.param)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.faq_question_answering)
+        print(pipeline_ins(self.param, max_seq_length=20))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_sentence_embedding(self):
+        pipeline_ins = pipeline(task=Tasks.faq_question_answering)
+        sentence_vec = pipeline_ins.get_sentence_embedding(
+            ['今天星期六', '明天星期几明天星期几'])
+        print(np.shape(sentence_vec))
+
+
+if __name__ == '__main__':
+    unittest.main()

From 930d55d9adb6eb25b101fe56d01e719360b27e66 Mon Sep 17 00:00:00 2001
From: "jiangnana.jnn" <jiangnana.jnn@alibaba-inc.com>
Date: Fri, 26 Aug 2022 13:58:50 +0800
Subject: [PATCH 013/175] support EasyCV framework and add Segformer model     
    Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9781849

    * support EasyCV
---
 configs/cv/configuration.json                 |   2 +-
 data/test/images/image_segmentation.jpg       |   3 +
 modelscope/fileio/format/json.py              |  10 +-
 modelscope/metainfo.py                        |  16 ++
 modelscope/metrics/builder.py                 |  12 +-
 modelscope/models/cv/easycv_base.py           |  25 ++
 .../image_semantic_segmentation/__init__.py   |   2 +
 .../image_semantic_segmentation/segformer.py  |  16 ++
 .../models/cv/object_detection/__init__.py    |   2 +
 .../models/cv/object_detection/yolox_pai.py   |  16 ++
 modelscope/msdatasets/__init__.py             |   2 +
 modelscope/msdatasets/cv/__init__.py          |   3 +
 .../cv/image_classification/__init__.py       |  20 ++
 .../classification_dataset.py                 |  19 ++
 .../image_semantic_segmentation/__init__.py   |  20 ++
 .../segmentation_dataset.py                   |  21 ++
 .../cv/object_detection/__init__.py           |  22 ++
 .../cv/object_detection/detection_dataset.py  |  49 ++++
 modelscope/pipelines/base.py                  |   6 +-
 modelscope/pipelines/cv/__init__.py           |   4 +-
 .../pipelines/cv/easycv_pipelines/__init__.py |  23 ++
 .../pipelines/cv/easycv_pipelines/base.py     |  95 +++++++
 .../cv/easycv_pipelines/detection_pipeline.py |  23 ++
 .../easycv_pipelines/segmentation_pipeline.py |  23 ++
 modelscope/trainers/easycv/__init__.py        |   0
 modelscope/trainers/easycv/trainer.py         | 175 +++++++++++++
 modelscope/trainers/easycv/utils/__init__.py  |  21 ++
 modelscope/trainers/easycv/utils/hooks.py     |  29 +++
 modelscope/trainers/easycv/utils/metric.py    |  52 ++++
 .../trainers/easycv/utils/register_util.py    |  59 +++++
 modelscope/trainers/hooks/checkpoint_hook.py  |  17 +-
 modelscope/trainers/hooks/logger/base.py      |  16 ++
 modelscope/trainers/trainer.py                |  96 ++++---
 modelscope/utils/ast_utils.py                 |   9 +-
 requirements/cv.txt                           |   2 +-
 requirements/runtime.txt                      |   1 +
 tests/pipelines/easycv_pipelines/__init__.py  |   0
 .../test_segmentation_pipeline.py             |  35 +++
 tests/trainers/easycv/__init__.py             |   0
 tests/trainers/easycv/test_easycv_trainer.py  | 244 ++++++++++++++++++
 tests/trainers/easycv/test_segformer.py       |  99 +++++++
 tests/utils/test_config.py                    |   5 +-
 42 files changed, 1235 insertions(+), 59 deletions(-)
 create mode 100644 data/test/images/image_segmentation.jpg
 create mode 100644 modelscope/models/cv/easycv_base.py
 create mode 100644 modelscope/models/cv/image_semantic_segmentation/segformer.py
 create mode 100644 modelscope/models/cv/object_detection/yolox_pai.py
 create mode 100644 modelscope/msdatasets/cv/__init__.py
 create mode 100644 modelscope/msdatasets/cv/image_classification/__init__.py
 create mode 100644 modelscope/msdatasets/cv/image_classification/classification_dataset.py
 create mode 100644 modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py
 create mode 100644 modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
 create mode 100644 modelscope/msdatasets/cv/object_detection/__init__.py
 create mode 100644 modelscope/msdatasets/cv/object_detection/detection_dataset.py
 create mode 100644 modelscope/pipelines/cv/easycv_pipelines/__init__.py
 create mode 100644 modelscope/pipelines/cv/easycv_pipelines/base.py
 create mode 100644 modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py
 create mode 100644 modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py
 create mode 100644 modelscope/trainers/easycv/__init__.py
 create mode 100644 modelscope/trainers/easycv/trainer.py
 create mode 100644 modelscope/trainers/easycv/utils/__init__.py
 create mode 100644 modelscope/trainers/easycv/utils/hooks.py
 create mode 100644 modelscope/trainers/easycv/utils/metric.py
 create mode 100644 modelscope/trainers/easycv/utils/register_util.py
 create mode 100644 tests/pipelines/easycv_pipelines/__init__.py
 create mode 100644 tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
 create mode 100644 tests/trainers/easycv/__init__.py
 create mode 100644 tests/trainers/easycv/test_easycv_trainer.py
 create mode 100644 tests/trainers/easycv/test_segformer.py

diff --git a/configs/cv/configuration.json b/configs/cv/configuration.json
index 2b0da89d..ae07fa10 100644
--- a/configs/cv/configuration.json
+++ b/configs/cv/configuration.json
@@ -2,7 +2,6 @@
     "framework": "pytorch",
 
     "task": "image_classification",
-    "work_dir": "./work_dir",
 
     "model": {
         "type": "classification",
@@ -119,6 +118,7 @@
     },
 
     "train": {
+        "work_dir": "./work_dir",
         "dataloader": {
             "batch_size_per_gpu": 2,
             "workers_per_gpu": 1
diff --git a/data/test/images/image_segmentation.jpg b/data/test/images/image_segmentation.jpg
new file mode 100644
index 00000000..a9c0875c
--- /dev/null
+++ b/data/test/images/image_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af6fa61274e497ecc170de5adc4b8e7ac89eba2bc22a6aa119b08ec7adbe9459
+size 146140
diff --git a/modelscope/fileio/format/json.py b/modelscope/fileio/format/json.py
index 977a8b8c..f615366f 100644
--- a/modelscope/fileio/format/json.py
+++ b/modelscope/fileio/format/json.py
@@ -1,5 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import json
+import jsonplus
 import numpy as np
 
 from .base import FormatHandler
@@ -22,14 +22,14 @@ def set_default(obj):
 
 
 class JsonHandler(FormatHandler):
+    """Use jsonplus, serialization of Python types to JSON that "just works"."""
 
     def load(self, file):
-        return json.load(file)
+        return jsonplus.loads(file.read())
 
     def dump(self, obj, file, **kwargs):
-        kwargs.setdefault('default', set_default)
-        json.dump(obj, file, **kwargs)
+        file.write(self.dumps(obj, **kwargs))
 
     def dumps(self, obj, **kwargs):
         kwargs.setdefault('default', set_default)
-        return json.dumps(obj, **kwargs)
+        return jsonplus.dumps(obj, **kwargs)
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 6ea03610..24f2f748 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -26,6 +26,10 @@ class Models(object):
     swinL_semantic_segmentation = 'swinL-semantic-segmentation'
     vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
 
+    # EasyCV models
+    yolox = 'YOLOX'
+    segformer = 'Segformer'
+
     # nlp models
     bert = 'bert'
     palm = 'palm-v2'
@@ -92,6 +96,8 @@ class Pipelines(object):
     body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
     human_detection = 'resnet18-human-detection'
     object_detection = 'vit-object-detection'
+    easycv_detection = 'easycv-detection'
+    easycv_segmentation = 'easycv-segmentation'
     salient_detection = 'u2net-salient-detection'
     image_classification = 'image-classification'
     face_detection = 'resnet-face-detection-scrfd10gkps'
@@ -171,6 +177,7 @@ class Trainers(object):
     """
 
     default = 'trainer'
+    easycv = 'easycv'
 
     # multi-modal trainers
     clip_multi_modal_embedding = 'clip-multi-modal-embedding'
@@ -307,3 +314,12 @@ class LR_Schedulers(object):
     LinearWarmup = 'LinearWarmup'
     ConstantWarmup = 'ConstantWarmup'
     ExponentialWarmup = 'ExponentialWarmup'
+
+
+class Datasets(object):
+    """ Names for different datasets.
+    """
+    ClsDataset = 'ClsDataset'
+    SegDataset = 'SegDataset'
+    DetDataset = 'DetDataset'
+    DetImagesMixDataset = 'DetImagesMixDataset'
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py
index ad41fd87..9ba80a6c 100644
--- a/modelscope/metrics/builder.py
+++ b/modelscope/metrics/builder.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Dict, Mapping, Union
 
 from modelscope.metainfo import Metrics
 from modelscope.utils.config import ConfigDict
@@ -35,16 +36,19 @@ task_default_metrics = {
 }
 
 
-def build_metric(metric_name: str,
+def build_metric(metric_cfg: Union[str, Dict],
                  field: str = default_group,
                  default_args: dict = None):
     """ Build metric given metric_name and field.
 
     Args:
-        metric_name (:obj:`str`): The metric name.
+        metric_cfg (str | dict): The metric name or metric config dict.
         field (str, optional):  The field of this metric, default value: 'default' for all fields.
         default_args (dict, optional): Default initialization arguments.
     """
-    cfg = ConfigDict({'type': metric_name})
+    if isinstance(metric_cfg, Mapping):
+        assert 'type' in metric_cfg
+    else:
+        metric_cfg = ConfigDict({'type': metric_cfg})
     return build_from_cfg(
-        cfg, METRICS, group_key=field, default_args=default_args)
+        metric_cfg, METRICS, group_key=field, default_args=default_args)
diff --git a/modelscope/models/cv/easycv_base.py b/modelscope/models/cv/easycv_base.py
new file mode 100644
index 00000000..7bc35e84
--- /dev/null
+++ b/modelscope/models/cv/easycv_base.py
@@ -0,0 +1,25 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.base import BaseModel
+from easycv.utils.ms_utils import EasyCVMeta
+
+from modelscope.models.base import TorchModel
+
+
+class EasyCVBaseModel(BaseModel, TorchModel):
+    """Base model for EasyCV."""
+
+    def __init__(self, model_dir=None, args=(), kwargs={}):
+        kwargs.pop(EasyCVMeta.ARCH, None)  # pop useless keys
+        BaseModel.__init__(self)
+        TorchModel.__init__(self, model_dir=model_dir)
+
+    def forward(self, img, mode='train', **kwargs):
+        if self.training:
+            losses = self.forward_train(img, **kwargs)
+            loss, log_vars = self._parse_losses(losses)
+            return dict(loss=loss, log_vars=log_vars)
+        else:
+            return self.forward_test(img, **kwargs)
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
diff --git a/modelscope/models/cv/image_semantic_segmentation/__init__.py b/modelscope/models/cv/image_semantic_segmentation/__init__.py
index 598d7c21..df56c5b8 100644
--- a/modelscope/models/cv/image_semantic_segmentation/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/__init__.py
@@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .semantic_seg_model import SemanticSegmentation
+    from .segformer import Segformer
 
 else:
     _import_structure = {
         'semantic_seg_model': ['SemanticSegmentation'],
+        'segformer': ['Segformer']
     }
 
     import sys
diff --git a/modelscope/models/cv/image_semantic_segmentation/segformer.py b/modelscope/models/cv/image_semantic_segmentation/segformer.py
new file mode 100644
index 00000000..46303526
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/segformer.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.segmentation import EncoderDecoder
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.easycv_base import EasyCVBaseModel
+from modelscope.utils.constant import Tasks
+
+
+@MODELS.register_module(
+    group_key=Tasks.image_segmentation, module_name=Models.segformer)
+class Segformer(EasyCVBaseModel, EncoderDecoder):
+
+    def __init__(self, model_dir=None, *args, **kwargs):
+        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
+        EncoderDecoder.__init__(self, *args, **kwargs)
diff --git a/modelscope/models/cv/object_detection/__init__.py b/modelscope/models/cv/object_detection/__init__.py
index fa73686d..974375ce 100644
--- a/modelscope/models/cv/object_detection/__init__.py
+++ b/modelscope/models/cv/object_detection/__init__.py
@@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .mmdet_model import DetectionModel
+    from .yolox_pai import YOLOX
 
 else:
     _import_structure = {
         'mmdet_model': ['DetectionModel'],
+        'yolox_pai': ['YOLOX']
     }
 
     import sys
diff --git a/modelscope/models/cv/object_detection/yolox_pai.py b/modelscope/models/cv/object_detection/yolox_pai.py
new file mode 100644
index 00000000..985cc136
--- /dev/null
+++ b/modelscope/models/cv/object_detection/yolox_pai.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.detection.detectors import YOLOX as _YOLOX
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.easycv_base import EasyCVBaseModel
+from modelscope.utils.constant import Tasks
+
+
+@MODELS.register_module(
+    group_key=Tasks.image_object_detection, module_name=Models.yolox)
+class YOLOX(EasyCVBaseModel, _YOLOX):
+
+    def __init__(self, model_dir=None, *args, **kwargs):
+        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
+        _YOLOX.__init__(self, *args, **kwargs)
diff --git a/modelscope/msdatasets/__init__.py b/modelscope/msdatasets/__init__.py
index 8e0647bb..073f9396 100644
--- a/modelscope/msdatasets/__init__.py
+++ b/modelscope/msdatasets/__init__.py
@@ -1 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from . import cv
 from .ms_dataset import MsDataset
diff --git a/modelscope/msdatasets/cv/__init__.py b/modelscope/msdatasets/cv/__init__.py
new file mode 100644
index 00000000..fad91bcf
--- /dev/null
+++ b/modelscope/msdatasets/cv/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from . import (image_classification, image_semantic_segmentation,
+               object_detection)
diff --git a/modelscope/msdatasets/cv/image_classification/__init__.py b/modelscope/msdatasets/cv/image_classification/__init__.py
new file mode 100644
index 00000000..95e8d7a1
--- /dev/null
+++ b/modelscope/msdatasets/cv/image_classification/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .classification_dataset import ClsDataset
+
+else:
+    _import_structure = {'classification_dataset': ['ClsDataset']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/cv/image_classification/classification_dataset.py b/modelscope/msdatasets/cv/image_classification/classification_dataset.py
new file mode 100644
index 00000000..c7145f2b
--- /dev/null
+++ b/modelscope/msdatasets/cv/image_classification/classification_dataset.py
@@ -0,0 +1,19 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.datasets.classification import ClsDataset as _ClsDataset
+
+from modelscope.metainfo import Datasets
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.image_classification, module_name=Datasets.ClsDataset)
+class ClsDataset(_ClsDataset):
+    """EasyCV dataset for classification.
+    For more details, please refer to :
+    https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/classification/raw.py .
+
+    Args:
+        data_source: Data source config to parse input data.
+        pipeline: Sequence of transform object or config dict to be composed.
+    """
diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py b/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py
new file mode 100644
index 00000000..26121bdb
--- /dev/null
+++ b/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .segmentation_dataset import SegDataset
+
+else:
+    _import_structure = {'segmentation_dataset': ['SegDataset']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
new file mode 100644
index 00000000..21114c11
--- /dev/null
+++ b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.datasets.segmentation import SegDataset as _SegDataset
+
+from modelscope.metainfo import Datasets
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.image_segmentation, module_name=Datasets.SegDataset)
+class SegDataset(_SegDataset):
+    """EasyCV dataset for semantic segmentation.
+    For more details, please refer to :
+    https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/segmentation/raw.py .
+
+    Args:
+        data_source: Data source config to parse input data.
+        pipeline: Sequence of transform object or config dict to be composed.
+        ignore_index (int): Label index to be ignored.
+        profiling: If set True, will print transform time.
+    """
diff --git a/modelscope/msdatasets/cv/object_detection/__init__.py b/modelscope/msdatasets/cv/object_detection/__init__.py
new file mode 100644
index 00000000..30af2d9b
--- /dev/null
+++ b/modelscope/msdatasets/cv/object_detection/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .detection_dataset import DetDataset, DetImagesMixDataset
+
+else:
+    _import_structure = {
+        'detection_dataset': ['DetDataset', 'DetImagesMixDataset']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/cv/object_detection/detection_dataset.py b/modelscope/msdatasets/cv/object_detection/detection_dataset.py
new file mode 100644
index 00000000..5b130a3e
--- /dev/null
+++ b/modelscope/msdatasets/cv/object_detection/detection_dataset.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.datasets.detection import DetDataset as _DetDataset
+from easycv.datasets.detection import \
+    DetImagesMixDataset as _DetImagesMixDataset
+
+from modelscope.metainfo import Datasets
+from modelscope.msdatasets.task_datasets import TASK_DATASETS
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.image_object_detection, module_name=Datasets.DetDataset)
+class DetDataset(_DetDataset):
+    """EasyCV dataset for object detection.
+    For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/raw.py .
+
+    Args:
+        data_source: Data source config to parse input data.
+        pipeline: Transform config list
+        profiling: If set True, will print pipeline time
+        classes: A list of class names, used in evaluation for result and groundtruth visualization
+    """
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.image_object_detection,
+    module_name=Datasets.DetImagesMixDataset)
+class DetImagesMixDataset(_DetImagesMixDataset):
+    """EasyCV dataset for object detection, a wrapper of multiple images mixed dataset.
+    Suitable for training on multiple images mixed data augmentation like
+    mosaic and mixup. For the augmentation pipeline of mixed image data,
+    the `get_indexes` method needs to be provided to obtain the image
+    indexes, and you can set `skip_flags` to change the pipeline running
+    process. At the same time, we provide the `dynamic_scale` parameter
+    to dynamically change the output image size.
+    output boxes format: cx, cy, w, h
+
+    For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/mix.py .
+
+    Args:
+        data_source (:obj:`DetSourceCoco`): Data source config to parse input data.
+        pipeline (Sequence[dict]): Sequence of transform object or
+            config dict to be composed.
+        dynamic_scale (tuple[int], optional): The image scale can be changed
+            dynamically. Default to None.
+        skip_type_keys (list[str], optional): Sequence of type string to
+            be skip pipeline. Default to None.
+        label_padding: out labeling padding [N, 120, 5]
+    """
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index 180ad757..c0f3cbd0 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -240,9 +240,9 @@ class Pipeline(ABC):
                 raise ValueError(f'Unsupported data type {type(data)}')
 
     def _process_single(self, input: Input, *args, **kwargs) -> Dict[str, Any]:
-        preprocess_params = kwargs.get('preprocess_params')
-        forward_params = kwargs.get('forward_params')
-        postprocess_params = kwargs.get('postprocess_params')
+        preprocess_params = kwargs.get('preprocess_params', {})
+        forward_params = kwargs.get('forward_params', {})
+        postprocess_params = kwargs.get('postprocess_params', {})
 
         out = self.preprocess(input, **preprocess_params)
         with device_placement(self.framework, self.device_name):
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index f4b4ae3e..b1a513e5 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -39,7 +39,7 @@ if TYPE_CHECKING:
     from .tinynas_classification_pipeline import TinynasClassificationPipeline
     from .video_category_pipeline import VideoCategoryPipeline
     from .virtual_try_on_pipeline import VirtualTryonPipeline
-
+    from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline
 else:
     _import_structure = {
         'action_recognition_pipeline': ['ActionRecognitionPipeline'],
@@ -84,6 +84,8 @@ else:
         'tinynas_classification_pipeline': ['TinynasClassificationPipeline'],
         'video_category_pipeline': ['VideoCategoryPipeline'],
         'virtual_try_on_pipeline': ['VirtualTryonPipeline'],
+        'easycv_pipelines':
+        ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline']
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/easycv_pipelines/__init__.py b/modelscope/pipelines/cv/easycv_pipelines/__init__.py
new file mode 100644
index 00000000..0984ff43
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .detection_pipeline import EasyCVDetectionPipeline
+    from .segmentation_pipeline import EasyCVSegmentationPipeline
+else:
+    _import_structure = {
+        'detection_pipeline': ['EasyCVDetectionPipeline'],
+        'segmentation_pipeline': ['EasyCVSegmentationPipeline']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py
new file mode 100644
index 00000000..d6495f0a
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/base.py
@@ -0,0 +1,95 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import os.path as osp
+from typing import Any
+
+from easycv.utils.ms_utils import EasyCVMeta
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.pipelines.util import is_official_hub_path
+from modelscope.utils.config import Config
+from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
+
+
+class EasyCVPipeline(object):
+    """Base pipeline for EasyCV.
+    The configuration file is loaded in modelscope style by default,
+    but prediction is actually done through the predictor api of easycv.
+    So here we do some adaptation work for configuration and predict api.
+    """
+
+    def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs):
+        """
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): glob pattern, relative to the model
+                directory, that must match exactly one model file.
+        """
+        self.model_file_pattern = model_file_pattern
+
+        assert isinstance(model, str)
+        if osp.exists(model):
+            model_dir = model
+        else:
+            assert is_official_hub_path(
+                model), 'Only support local model path and official hub path!'
+            model_dir = snapshot_download(
+                model_id=model, revision=DEFAULT_MODEL_REVISION)
+
+        assert osp.isdir(model_dir)
+        model_files = glob.glob(
+            os.path.join(model_dir, self.model_file_pattern))
+        assert len(
+            model_files
+        ) == 1, f'Need one model file, but find {len(model_files)}: {model_files}'
+
+        model_path = model_files[0]
+        self.model_path = model_path
+
+        # the modelscope configuration file is expected inside the model dir
+        self.config_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
+        assert os.path.exists(
+            self.config_file
+        ), f'Not find "{ModelFile.CONFIGURATION}" in model directory!'
+
+        self.cfg = Config.from_file(self.config_file)
+        self.predict_op = self._build_predict_op()
+
+    def _build_predict_op(self):
+        """Build EasyCV predictor from ``cfg.pipeline.predictor_config``."""
+        from easycv.predictors.builder import build_predictor
+
+        easycv_config = self._to_easycv_config()
+        pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, {
+            'model_path': self.model_path,
+            'config_file': easycv_config
+        })
+        return pipeline_op
+
+    def _to_easycv_config(self):
+        """Build the EasyCV-style config that is handed to the predictor."""
+        # TODO: refine config compatibility problems
+
+        easycv_arch = self.cfg.model.pop(EasyCVMeta.ARCH, None)
+        model_cfg = self.cfg.model
+        # Merge the popped easycv arch settings back into the model config
+        if easycv_arch is not None:
+            model_cfg.update(easycv_arch)
+
+        easycv_config = Config(dict(model=model_cfg))
+
+        reserved_keys = []
+        if hasattr(self.cfg, EasyCVMeta.META):
+            easycv_meta_cfg = getattr(self.cfg, EasyCVMeta.META)
+            reserved_keys = easycv_meta_cfg.get(EasyCVMeta.RESERVED_KEYS, [])
+            for key in reserved_keys:
+                easycv_config.merge_from_dict({key: getattr(self.cfg, key)})
+        if 'test_pipeline' not in reserved_keys:
+            easycv_config.merge_from_dict(
+                {'test_pipeline': self.cfg.dataset.val.get('pipeline', [])})
+
+        return easycv_config
+
+    def __call__(self, inputs) -> Any:
+        # TODO: support image url
+        return self.predict_op(inputs)
diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py
new file mode 100644
index 00000000..32365102
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from .base import EasyCVPipeline
+
+
+@PIPELINES.register_module(
+    Tasks.image_object_detection, module_name=Pipelines.easycv_detection)
+class EasyCVDetectionPipeline(EasyCVPipeline):
+    """Pipeline for easycv detection task."""
+
+    def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs):
+        """
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): model file pattern.
+            *args, **kwargs: forwarded unchanged to ``EasyCVPipeline``.
+        """
+
+        # Forward positionally: combining ``model=...`` keywords with ``*args``
+        # raises "got multiple values for argument" once ``args`` is non-empty.
+        super(EasyCVDetectionPipeline, self).__init__(
+            model, model_file_pattern, *args, **kwargs)
diff --git a/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py
new file mode 100644
index 00000000..2182e3b3
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from .base import EasyCVPipeline
+
+
+@PIPELINES.register_module(
+    Tasks.image_segmentation, module_name=Pipelines.easycv_segmentation)
+class EasyCVSegmentationPipeline(EasyCVPipeline):
+    """Pipeline for easycv segmentation task."""
+
+    def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs):
+        """
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): model file pattern.
+            *args, **kwargs: forwarded unchanged to ``EasyCVPipeline``.
+        """
+
+        # Forward positionally: combining ``model=...`` keywords with ``*args``
+        # raises "got multiple values for argument" once ``args`` is non-empty.
+        super(EasyCVSegmentationPipeline, self).__init__(
+            model, model_file_pattern, *args, **kwargs)
diff --git a/modelscope/trainers/easycv/__init__.py b/modelscope/trainers/easycv/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/trainers/easycv/trainer.py b/modelscope/trainers/easycv/trainer.py
new file mode 100644
index 00000000..dee06a41
--- /dev/null
+++ b/modelscope/trainers/easycv/trainer.py
@@ -0,0 +1,175 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from functools import partial
+from typing import Callable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.utils.data import Dataset
+
+from modelscope.metainfo import Trainers
+from modelscope.models.base import TorchModel
+from modelscope.msdatasets import MsDataset
+from modelscope.preprocessors import Preprocessor
+from modelscope.trainers import EpochBasedTrainer
+from modelscope.trainers.base import TRAINERS
+from modelscope.trainers.easycv.utils import register_util
+from modelscope.trainers.hooks import HOOKS
+from modelscope.trainers.parallel.builder import build_parallel
+from modelscope.trainers.parallel.utils import is_parallel
+from modelscope.utils.config import Config
+from modelscope.utils.constant import DEFAULT_MODEL_REVISION
+from modelscope.utils.import_utils import LazyImportModule
+from modelscope.utils.registry import default_group
+
+
+@TRAINERS.register_module(module_name=Trainers.easycv)
+class EasyCVEpochBasedTrainer(EpochBasedTrainer):
+    """Epoch based Trainer for EasyCV.
+
+    Args:
+        task: Task name.
+        cfg_file(str): The config file of EasyCV.
+        model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir
+            or a model id. If model is None, build_model method will be called.
+        train_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*):
+            The dataset to use for training.
+            Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
+            distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a
+            `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
+            manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
+            sets the seed of the RNGs used.
+        eval_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*): The dataset to use for evaluation.
+        preprocessor (:obj:`Preprocessor`, *optional*): The optional preprocessor.
+            NOTE: If the preprocessor has been called before the dataset fed into this trainer by user's custom code,
+            this parameter should be None, meanwhile remove the 'preprocessor' key from the cfg_file.
+            Else the preprocessor will be instantiated from the cfg_file or assigned from this parameter and
+            this preprocessing action will be executed every time the dataset's __getitem__ is called.
+        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple
+            containing the optimizer and the scheduler to use.
+        max_epochs: (int, optional): Total training epochs.
+    """
+
+    def __init__(
+            self,
+            task: str,
+            cfg_file: Optional[str] = None,
+            model: Optional[Union[TorchModel, nn.Module, str]] = None,
+            arg_parse_fn: Optional[Callable] = None,
+            train_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            preprocessor: Optional[Preprocessor] = None,
+            optimizers: Tuple[torch.optim.Optimizer,
+                              torch.optim.lr_scheduler._LRScheduler] = (None,
+                                                                        None),
+            model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
+            **kwargs):
+        # `task` must be set before super().__init__(), which calls rebuild_config().
+        self.task = task
+        register_util.register_parallel()
+        register_util.register_part_mmcv_hooks_to_ms()
+
+        super(EasyCVEpochBasedTrainer, self).__init__(
+            model=model,
+            cfg_file=cfg_file,
+            arg_parse_fn=arg_parse_fn,
+            preprocessor=preprocessor,
+            optimizers=optimizers,
+            model_revision=model_revision,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            **kwargs)
+
+        # reset data_collator: use mmcv's collate with the configured per-gpu batch sizes
+        from mmcv.parallel import collate
+
+        self.train_data_collator = partial(
+            collate,
+            samples_per_gpu=self.cfg.train.dataloader.batch_size_per_gpu)
+        self.eval_data_collator = partial(
+            collate,
+            samples_per_gpu=self.cfg.evaluation.dataloader.batch_size_per_gpu)
+
+        # Register easycv hooks dynamically. If the hook already exists in modelscope,
+        # the hook in modelscope will be used, otherwise register easycv hook into ms.
+        # We must manually trigger lazy import to detect whether the hook is in modelscope.
+        # TODO: use ast index to detect whether the hook is in modelscope
+        for h_i in self.cfg.train.get('hooks', []):
+            sig = ('HOOKS', default_group, h_i['type'])
+            LazyImportModule.import_module(sig)
+            if h_i['type'] not in HOOKS._modules[default_group]:
+                if h_i['type'] in [
+                        'TensorboardLoggerHookV2', 'WandbLoggerHookV2'
+                ]:
+                    raise ValueError(
+                        'Not support hook %s now, we will support it in the future!'
+                        % h_i['type'])
+                register_util.register_hook_to_ms(h_i['type'], self.logger)
+
+        # reset parallel: in non-distributed mode wrap the model with MMDataParallel
+        if not self._dist:
+            assert not is_parallel(
+                self.model
+            ), 'Not support model wrapped by custom parallel if not in distributed mode!'
+            dp_cfg = dict(
+                type='MMDataParallel',
+                module=self.model,
+                device_ids=[torch.cuda.current_device()])
+            self.model = build_parallel(dp_cfg)
+
+    def create_optimizer_and_scheduler(self):
+        """ Create optimizer; store the lr scheduler config as ``cfg.train.lr_scheduler_hook``.
+        """
+        optimizer, lr_scheduler = self.optimizers
+        if optimizer is None:
+            optimizer_cfg = self.cfg.train.get('optimizer', None)
+        else:
+            optimizer_cfg = None
+
+        optim_options = {}
+        if optimizer_cfg is not None:
+            optim_options = optimizer_cfg.pop('options', {})
+            from easycv.apis.train import build_optimizer
+            optimizer = build_optimizer(self.model, optimizer_cfg)
+
+        if lr_scheduler is None:
+            lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None)
+        else:
+            lr_scheduler_cfg = None
+
+        lr_options = {}
+        # Adapt to mmcv lr scheduler hook.
+        # Please refer to: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py
+        if lr_scheduler_cfg is not None:
+            assert optimizer is not None
+            lr_options = lr_scheduler_cfg.pop('options', {})
+            assert 'policy' in lr_scheduler_cfg
+            policy_type = lr_scheduler_cfg.pop('policy')
+            if policy_type == policy_type.lower():
+                policy_type = policy_type.title()
+            hook_type = policy_type + 'LrUpdaterHook'
+            lr_scheduler_cfg['type'] = hook_type
+
+            self.cfg.train.lr_scheduler_hook = lr_scheduler_cfg
+
+        self.optimizer = optimizer
+        self.lr_scheduler = lr_scheduler
+
+        return self.optimizer, self.lr_scheduler, optim_options, lr_options
+
+    def to_parallel(self, model) -> Union[nn.Module, TorchModel]:
+        """Wrap ``model`` in a parallel wrapper bound to the current CUDA device."""
+        if self.cfg.get('parallel', None) is not None:
+            self.cfg.parallel.update(
+                dict(module=model, device_ids=[torch.cuda.current_device()]))
+            return build_parallel(self.cfg.parallel)
+        # No `parallel` section in the config: use MMDistributedDataParallel.
+        dp_cfg = dict(
+            type='MMDistributedDataParallel',
+            module=model,
+            device_ids=[torch.cuda.current_device()])
+        return build_parallel(dp_cfg)
+
+    def rebuild_config(self, cfg: Config):
+        """Set ``cfg.task`` to the task given at construction and return ``cfg``."""
+        cfg.task = self.task
+        return cfg
diff --git a/modelscope/trainers/easycv/utils/__init__.py b/modelscope/trainers/easycv/utils/__init__.py
new file mode 100644
index 00000000..23cfa36a
--- /dev/null
+++ b/modelscope/trainers/easycv/utils/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .hooks import AddLrLogHook
+    from .metric import EasyCVMetric
+
+else:
+    _import_structure = {'hooks': ['AddLrLogHook'], 'metric': ['EasyCVMetric']}
+
+    import sys
+    # Replace this module with a LazyImportModule built from _import_structure.
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/trainers/easycv/utils/hooks.py b/modelscope/trainers/easycv/utils/hooks.py
new file mode 100644
index 00000000..62bc6d1e
--- /dev/null
+++ b/modelscope/trainers/easycv/utils/hooks.py
@@ -0,0 +1,29 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.trainers.hooks import HOOKS, Priority
+from modelscope.trainers.hooks.lr_scheduler_hook import LrSchedulerHook
+from modelscope.utils.constant import LogKeys
+
+
+@HOOKS.register_module(module_name='AddLrLogHook')
+class AddLrLogHook(LrSchedulerHook):
+    """Write the current lr to the log buffer before each train iter and epoch.
+    ModelScope logs the lr in its lr scheduler hook, not in the trainer as EasyCV does,
+    and the mmcv lr scheduler hook used by EasyCV has no lr log, so this hook adds it.
+    It will be deleted in the future.
+    """
+    PRIORITY = Priority.NORMAL
+
+    def __init__(self):
+        pass  # note: LrSchedulerHook.__init__ is not invoked
+
+    def before_run(self, trainer):
+        pass  # replaces the inherited implementation with a no-op
+
+    def before_train_iter(self, trainer):
+        trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer)
+
+    def before_train_epoch(self, trainer):
+        trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer)
+
+    def after_train_epoch(self, trainer):
+        pass  # replaces the inherited implementation with a no-op
diff --git a/modelscope/trainers/easycv/utils/metric.py b/modelscope/trainers/easycv/utils/metric.py
new file mode 100644
index 00000000..53937b67
--- /dev/null
+++ b/modelscope/trainers/easycv/utils/metric.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import itertools
+from typing import Dict
+
+import numpy as np
+import torch
+
+from modelscope.metrics.base import Metric
+from modelscope.metrics.builder import METRICS
+
+
+@METRICS.register_module(module_name='EasyCVMetric')
+class EasyCVMetric(Metric):
+    """Adapt to ModelScope Metric for EasyCV evaluator.
+    """
+
+    def __init__(self, trainer=None, evaluators=None, *args, **kwargs):
+        """Keep ``trainer`` and build the EasyCV evaluators from ``evaluators``."""
+        from easycv.core.evaluation.builder import build_evaluator
+        self.trainer = trainer
+        self.evaluators = build_evaluator(evaluators)
+        self.preds = []
+        self.grountruths = []
+
+    def add(self, outputs: Dict, inputs: Dict):
+        """Collect one batch of model outputs; ``inputs`` is not used."""
+        self.preds.append(outputs)
+
+    def evaluate(self):
+        """Merge the collected batch outputs per key and evaluate them."""
+        # Group the values of every batch by prediction key.
+        results = {}
+        for batch in self.preds:
+            for k, v in batch.items():
+                results.setdefault(k, []).append(v)
+
+        for k, v in results.items():
+            if len(v) == 0:
+                raise ValueError(f'empty result for {k}')
+
+            if isinstance(v[0], torch.Tensor):
+                results[k] = torch.cat(v, 0)
+            elif isinstance(v[0], (list, np.ndarray)):
+                results[k] = list(itertools.chain.from_iterable(v))
+            else:
+                raise ValueError(
+                    f'value of batch prediction dict should only be tensor or list, {k} type is {type(v[0])}'
+                )
+
+        metric_values = self.trainer.eval_dataset.evaluate(
+            results, self.evaluators)
+        return metric_values
diff --git a/modelscope/trainers/easycv/utils/register_util.py b/modelscope/trainers/easycv/utils/register_util.py
new file mode 100644
index 00000000..f80eaace
--- /dev/null
+++ b/modelscope/trainers/easycv/utils/register_util.py
@@ -0,0 +1,59 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import inspect
+import logging
+
+from modelscope.trainers.hooks import HOOKS
+from modelscope.trainers.parallel.builder import PARALLEL
+
+
+def register_parallel():
+    """Register mmcv parallel wrappers to ModelScope, skipping known names."""
+    from mmcv.parallel import MMDistributedDataParallel, MMDataParallel
+    from modelscope.utils.registry import default_group
+
+    for cls in (MMDistributedDataParallel, MMDataParallel):
+        if cls.__name__ not in PARALLEL._modules[default_group]:
+            PARALLEL.register_module(module_name=cls.__name__, module_cls=cls)
+
+
+def register_hook_to_ms(hook_name, logger=None):
+    """Copy the EasyCV hook named ``hook_name`` into the ModelScope registry."""
+    from easycv.hooks import HOOKS as _EV_HOOKS
+
+    easycv_hooks = _EV_HOOKS._module_dict
+    if hook_name not in easycv_hooks:
+        raise ValueError(
+            f'Not found hook "{hook_name}" in EasyCV hook registries!')
+    HOOKS.register_module(
+        module_name=hook_name, module_cls=easycv_hooks[hook_name])
+    # Fall back to the root ``logging`` module when no logger is supplied.
+    emit = logging.info if logger is None else logger.info
+    emit(f'Register hook "{hook_name}" to modelscope hooks.')
+
+
+def register_part_mmcv_hooks_to_ms():
+    """Register required mmcv hooks to ModelScope.
+    Currently we only register the lr scheduler hooks in EasyCV and mmcv, plus PreLoggerHook.
+    Hooks already present in the ModelScope registry are skipped.
+    Please refer to:
+        EasyCV: https://github.com/alibaba/EasyCV/blob/master/easycv/hooks/lr_update_hook.py
+        mmcv: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py
+    """
+    from mmcv.runner.hooks import lr_updater
+    from mmcv.runner.hooks import HOOKS as _MMCV_HOOKS
+    from easycv.hooks import StepFixCosineAnnealingLrUpdaterHook, YOLOXLrUpdaterHook
+    from easycv.hooks.logger import PreLoggerHook
+    from modelscope.utils.registry import default_group
+
+    mmcv_hooks_in_easycv = [('StepFixCosineAnnealingLrUpdaterHook',
+                             StepFixCosineAnnealingLrUpdaterHook),
+                            ('YOLOXLrUpdaterHook', YOLOXLrUpdaterHook),
+                            ('PreLoggerHook', PreLoggerHook)]
+
+    members = inspect.getmembers(lr_updater)
+    members.extend(mmcv_hooks_in_easycv)
+
+    registered = HOOKS._modules[default_group]
+    for name, obj in members:
+        if name in _MMCV_HOOKS._module_dict and name not in registered:
+            HOOKS.register_module(module_name=name, module_cls=obj)
diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index 623d4654..cf7a0f7a 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -81,12 +81,19 @@ class CheckpointHook(Hook):
         if self.is_last_epoch(trainer) and self.by_epoch:
             output_dir = os.path.join(self.save_dir,
                                       ModelFile.TRAIN_OUTPUT_DIR)
+            from modelscope.trainers.parallel.utils import is_parallel
 
-            trainer.model.save_pretrained(
-                output_dir,
-                ModelFile.TORCH_MODEL_BIN_FILE,
-                save_function=save_checkpoint,
-                config=trainer.cfg.to_dict())
+            if is_parallel(trainer.model):
+                model = trainer.model.module
+            else:
+                model = trainer.model
+
+            if hasattr(model, 'save_pretrained'):
+                model.save_pretrained(
+                    output_dir,
+                    ModelFile.TORCH_MODEL_BIN_FILE,
+                    save_function=save_checkpoint,
+                    config=trainer.cfg.to_dict())
 
     def after_train_iter(self, trainer):
         if self.by_epoch:
diff --git a/modelscope/trainers/hooks/logger/base.py b/modelscope/trainers/hooks/logger/base.py
index e1da251f..684c4a8c 100644
--- a/modelscope/trainers/hooks/logger/base.py
+++ b/modelscope/trainers/hooks/logger/base.py
@@ -60,6 +60,18 @@ class LoggerHook(Hook):
         else:
             return False
 
+    def fetch_tensor(self, trainer, n=0):
+        """Convert tensors among the latest ``n`` logged values (all if 0) to numpy."""
+        assert n >= 0
+        history = trainer.log_buffer.val_history
+        for key in history:
+            # ``[-0:]`` is the whole list, so ``n == 0`` covers every value.
+            history[key][-n:] = [
+                v.clone().detach().cpu().numpy()
+                if isinstance(v, torch.Tensor) else v
+                for v in history[key][-n:]
+            ]
+
     def get_epoch(self, trainer):
         if trainer.mode in [ModeKeys.TRAIN, ModeKeys.EVAL]:
             epoch = trainer.epoch + 1
@@ -88,11 +100,14 @@ class LoggerHook(Hook):
 
     def after_train_iter(self, trainer):
         if self.by_epoch and self.every_n_inner_iters(trainer, self.interval):
+            self.fetch_tensor(trainer, self.interval)
             trainer.log_buffer.average(self.interval)
         elif not self.by_epoch and self.every_n_iters(trainer, self.interval):
+            self.fetch_tensor(trainer, self.interval)
             trainer.log_buffer.average(self.interval)
         elif self.end_of_epoch(trainer) and not self.ignore_last:
             # not precise but more stable
+            self.fetch_tensor(trainer, self.interval)
             trainer.log_buffer.average(self.interval)
 
         if trainer.log_buffer.ready:
@@ -107,6 +122,7 @@ class LoggerHook(Hook):
                 trainer.log_buffer.clear_output()
 
     def after_val_epoch(self, trainer):
+        self.fetch_tensor(trainer)
         trainer.log_buffer.average()
         self.log(trainer)
         if self.reset_flag:
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index c48ab2cd..dc8c5c09 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -26,7 +26,6 @@ from modelscope.msdatasets.task_datasets.torch_base_dataset import \
     TorchTaskDataset
 from modelscope.preprocessors.base import Preprocessor
 from modelscope.preprocessors.builder import build_preprocessor
-from modelscope.preprocessors.common import Compose
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.trainers.hooks.priority import Priority, get_priority
 from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
@@ -83,7 +82,8 @@ class EpochBasedTrainer(BaseTrainer):
             model: Optional[Union[TorchModel, nn.Module, str]] = None,
             cfg_file: Optional[str] = None,
             arg_parse_fn: Optional[Callable] = None,
-            data_collator: Optional[Callable] = None,
+            data_collator: Optional[Union[Callable, Dict[str,
+                                                         Callable]]] = None,
             train_dataset: Optional[Union[MsDataset, Dataset]] = None,
             eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
             preprocessor: Optional[Union[Preprocessor,
@@ -104,21 +104,24 @@ class EpochBasedTrainer(BaseTrainer):
             if cfg_file is None:
                 cfg_file = os.path.join(self.model_dir,
                                         ModelFile.CONFIGURATION)
-            self.model = self.build_model()
         else:
-            assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class'
-            assert isinstance(
-                model,
-                (TorchModel, nn.Module
-                 )), 'model should be either str, TorchMode or nn.Module.'
+            assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!'
             self.model_dir = os.path.dirname(cfg_file)
-            self.model = model
 
         super().__init__(cfg_file, arg_parse_fn)
+
         # add default config
         self.cfg.merge_from_dict(self._get_default_config(), force=False)
         self.cfg = self.rebuild_config(self.cfg)
 
+        if 'cfg_options' in kwargs:
+            self.cfg.merge_from_dict(kwargs['cfg_options'])
+
+        if isinstance(model, (TorchModel, nn.Module)):
+            self.model = model
+        else:
+            self.model = self.build_model()
+
         if 'work_dir' in kwargs:
             self.work_dir = kwargs['work_dir']
         else:
@@ -162,7 +165,24 @@ class EpochBasedTrainer(BaseTrainer):
             mode=ModeKeys.EVAL,
             preprocessor=self.eval_preprocessor)
 
-        self.data_collator = data_collator if data_collator is not None else default_collate
+        # A mapping selects collators per mode; a callable (or None) serves both.
+        self.train_data_collator, self.eval_data_collator = None, None
+        if isinstance(data_collator, Mapping):
+            if not (ConfigKeys.train in data_collator
+                    or ConfigKeys.val in data_collator):
+                raise ValueError(
+                    f'data_collator must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!'
+                )
+            if ConfigKeys.train in data_collator:
+                assert isinstance(data_collator[ConfigKeys.train], Callable)
+                self.train_data_collator = data_collator[ConfigKeys.train]
+            if ConfigKeys.val in data_collator:
+                assert isinstance(data_collator[ConfigKeys.val], Callable)
+                self.eval_data_collator = data_collator[ConfigKeys.val]
+        else:
+            collate_fn = default_collate if data_collator is None else data_collator
+            self.train_data_collator = self.eval_data_collator = collate_fn
+
         self.metrics = self.get_metrics()
         self._metric_values = None
         self.optimizers = optimizers
@@ -364,7 +384,7 @@ class EpochBasedTrainer(BaseTrainer):
 
         return train_preprocessor, eval_preprocessor
 
-    def get_metrics(self) -> List[str]:
+    def get_metrics(self) -> List[Union[str, Dict]]:
         """Get the metric class types.
 
         The first choice will be the metrics configured in the config file, if not found, the default metrics will be
@@ -384,7 +404,7 @@ class EpochBasedTrainer(BaseTrainer):
                 f'Metrics are needed in evaluation, please try to either '
                 f'add metrics in configuration.json or add the default metric for {self.cfg.task}.'
             )
-        if isinstance(metrics, str):
+        if isinstance(metrics, (str, Mapping)):
             metrics = [metrics]
         return metrics
 
@@ -399,6 +419,7 @@ class EpochBasedTrainer(BaseTrainer):
                 self.train_dataset,
                 dist=self._dist,
                 seed=self._seed,
+                collate_fn=self.train_data_collator,
                 **self.cfg.train.get('dataloader', {}))
         self.data_loader = self.train_dataloader
 
@@ -418,6 +439,7 @@ class EpochBasedTrainer(BaseTrainer):
                 self.eval_dataset,
                 dist=self._dist,
                 seed=self._seed,
+                collate_fn=self.eval_data_collator,
                 **self.cfg.evaluation.get('dataloader', {}))
         self.data_loader = self.eval_dataloader
         metric_classes = [build_metric(metric) for metric in self.metrics]
@@ -440,7 +462,7 @@ class EpochBasedTrainer(BaseTrainer):
         override this method in a subclass.
 
         """
-        model = Model.from_pretrained(self.model_dir)
+        model = Model.from_pretrained(self.model_dir, cfg_dict=self.cfg)
         if not isinstance(model, nn.Module) and hasattr(model, 'model'):
             return model.model
         elif isinstance(model, nn.Module):
@@ -552,6 +574,7 @@ class EpochBasedTrainer(BaseTrainer):
             self.train_dataset,
             dist=self._dist,
             seed=self._seed,
+            collate_fn=self.train_data_collator,
             **self.cfg.train.get('dataloader', {}))
         return data_loader
 
@@ -569,9 +592,9 @@ class EpochBasedTrainer(BaseTrainer):
                 mode=ModeKeys.EVAL,
                 preprocessor=self.eval_preprocessor)
 
-        batch_size = self.cfg.evaluation.batch_size
-        workers = self.cfg.evaluation.workers
-        shuffle = self.cfg.evaluation.get('shuffle', False)
+        batch_size = self.cfg.evaluation.dataloader.batch_size_per_gpu
+        workers = self.cfg.evaluation.dataloader.workers_per_gpu
+        shuffle = self.cfg.evaluation.dataloader.get('shuffle', False)
         data_loader = self._build_dataloader_with_dataset(
             self.eval_dataset,
             batch_size_per_gpu=batch_size,
@@ -580,25 +603,31 @@ class EpochBasedTrainer(BaseTrainer):
             dist=self._dist,
             seed=self._seed,
             persistent_workers=True,
+            collate_fn=self.eval_data_collator,
         )
         return data_loader
 
     def build_dataset(self, data_cfg, mode, preprocessor=None):
         """ Build torch dataset object using data config
         """
-        dataset = MsDataset.load(
-            dataset_name=data_cfg.name,
-            split=data_cfg.split,
-            subset_name=data_cfg.subset_name if hasattr(
-                data_cfg, 'subset_name') else None,
-            hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
-            **data_cfg,
-        )
-        cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
-        torch_dataset = dataset.to_torch_dataset(
-            task_data_config=cfg,
-            task_name=self.cfg.task,
-            preprocessors=self.preprocessor)
+        # TODO: support MsDataset load for cv
+        if hasattr(data_cfg, 'name'):
+            dataset = MsDataset.load(
+                dataset_name=data_cfg.name,
+                split=data_cfg.split,
+                subset_name=data_cfg.subset_name if hasattr(
+                    data_cfg, 'subset_name') else None,
+                hub=data_cfg.hub
+                if hasattr(data_cfg, 'hub') else Hubs.modelscope,
+                **data_cfg,
+            )
+            cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
+            torch_dataset = dataset.to_torch_dataset(
+                task_data_config=cfg,
+                task_name=self.cfg.task,
+                preprocessors=self.preprocessor)
+        else:
+            torch_dataset = build_task_dataset(data_cfg, self.cfg.task)
         dataset = self.to_task_dataset(torch_dataset, mode)
         return dataset
 
@@ -746,7 +775,6 @@ class EpochBasedTrainer(BaseTrainer):
             sampler=sampler,
             num_workers=num_workers,
             batch_sampler=batch_sampler,
-            collate_fn=self.data_collator,
             pin_memory=kwargs.pop('pin_memory', False),
             worker_init_fn=init_fn,
             **kwargs)
@@ -820,12 +848,14 @@ class EpochBasedTrainer(BaseTrainer):
         Args:
             hook (:obj:`Hook`): The hook to be registered.
         """
-        assert isinstance(hook, Hook)
         # insert the hook to a sorted list
         inserted = False
         for i in range(len(self._hooks) - 1, -1, -1):
-            if get_priority(hook.PRIORITY) > get_priority(
-                    self._hooks[i].PRIORITY):
+            p = hook.PRIORITY if hasattr(hook, 'PRIORITY') else Priority.NORMAL
+            p_i = self._hooks[i].PRIORITY if hasattr(
+                self._hooks[i], 'PRIORITY') else Priority.NORMAL
+
+            if get_priority(p) > get_priority(p_i):
                 self._hooks.insert(i + 1, hook)
                 inserted = True
                 break
diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py
index 2d2f61d8..990a9571 100644
--- a/modelscope/utils/ast_utils.py
+++ b/modelscope/utils/ast_utils.py
@@ -15,9 +15,9 @@ import json
 
 from modelscope import __version__
 from modelscope.fileio.file import LocalStorage
-from modelscope.metainfo import (Heads, Hooks, LR_Schedulers, Metrics, Models,
-                                 Optimizers, Pipelines, Preprocessors,
-                                 TaskModels, Trainers)
+from modelscope.metainfo import (Datasets, Heads, Hooks, LR_Schedulers,
+                                 Metrics, Models, Optimizers, Pipelines,
+                                 Preprocessors, TaskModels, Trainers)
 from modelscope.utils.constant import Fields, Tasks
 from modelscope.utils.file_utils import get_default_cache_dir
 from modelscope.utils.logger import get_logger
@@ -32,8 +32,7 @@ MODELSCOPE_PATH = p.resolve().parents[1]
 REGISTER_MODULE = 'register_module'
 IGNORED_PACKAGES = ['modelscope', '.']
 SCAN_SUB_FOLDERS = [
-    'models', 'metrics', 'pipelines', 'preprocessors',
-    'msdatasets/task_datasets', 'trainers'
+    'models', 'metrics', 'pipelines', 'preprocessors', 'trainers', 'msdatasets'
 ]
 INDEXER_FILE = 'ast_indexer'
 DECORATOR_KEY = 'decorators'
diff --git a/requirements/cv.txt b/requirements/cv.txt
index 8dcf6791..b7b3e4e8 100644
--- a/requirements/cv.txt
+++ b/requirements/cv.txt
@@ -14,7 +14,7 @@ mmcls>=0.21.0
 mmdet>=2.25.0
 networkx>=2.5
 onnxruntime>=1.10
-pai-easycv>=0.5
+pai-easycv>=0.6.0
 pandas
 psutil
 regex
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index c059b4ba..b51faeda 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -4,6 +4,7 @@ easydict
 einops
 filelock>=3.3.0
 gast>=0.2.2
+jsonplus
 numpy
 opencv-python
 oss2
diff --git a/tests/pipelines/easycv_pipelines/__init__.py b/tests/pipelines/easycv_pipelines/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
new file mode 100644
index 00000000..0eca2a7f
--- /dev/null
+++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
@@ -0,0 +1,35 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import numpy as np
+from PIL import Image
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class EasyCVSegmentationPipelineTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_segformer_b0(self):
+        img_path = 'data/test/images/image_segmentation.jpg'
+        model_id = 'EasyCV/EasyCV-Segformer-b0'
+        img = np.asarray(Image.open(img_path))
+
+        object_detect = pipeline(task=Tasks.image_segmentation, model=model_id)
+        outputs = object_detect(img_path)
+        self.assertEqual(len(outputs), 1)
+
+        results = outputs[0]
+        self.assertListEqual(
+            list(img.shape)[:2], list(results['seg_pred'][0].shape))
+        self.assertListEqual(results['seg_pred'][0][1, :10].tolist(),
+                             [161 for i in range(10)])
+        self.assertListEqual(results['seg_pred'][0][-1, -10:].tolist(),
+                             [133 for i in range(10)])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/easycv/__init__.py b/tests/trainers/easycv/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/trainers/easycv/test_easycv_trainer.py b/tests/trainers/easycv/test_easycv_trainer.py
new file mode 100644
index 00000000..6d1d7ec4
--- /dev/null
+++ b/tests/trainers/easycv/test_easycv_trainer.py
@@ -0,0 +1,244 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import shutil
+import tempfile
+import unittest
+
+import json
+import requests
+import torch
+
+from modelscope.metainfo import Models, Pipelines, Trainers
+from modelscope.trainers import build_trainer
+from modelscope.utils.config import Config
+from modelscope.utils.constant import LogKeys, ModeKeys, Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import DistributedTestCase, test_level
+from modelscope.utils.torch_utils import is_master
+
+
+def _download_data(url, save_dir):
+    r = requests.get(url, verify=True)
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    zip_name = os.path.split(url)[-1]
+    save_path = os.path.join(save_dir, zip_name)
+    with open(save_path, 'wb') as f:
+        f.write(r.content)
+
+    unpack_dir = os.path.join(save_dir, os.path.splitext(zip_name)[0])
+    shutil.unpack_archive(save_path, unpack_dir)
+
+
+def train_func(work_dir, dist=False, log_config=3, imgs_per_gpu=4):
+    import easycv
+    config_path = os.path.join(
+        os.path.dirname(easycv.__file__),
+        'configs/detection/yolox/yolox_s_8xb16_300e_coco.py')
+
+    data_dir = os.path.join(work_dir, 'small_coco_test')
+    url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/datasets/small_coco.zip'
+    if is_master():
+        _download_data(url, data_dir)
+
+    import time
+    time.sleep(1)
+    cfg = Config.from_file(config_path)
+
+    cfg.work_dir = work_dir
+    cfg.total_epochs = 2
+    cfg.checkpoint_config.interval = 1
+    cfg.eval_config.interval = 1
+    cfg.log_config = dict(
+        interval=log_config,
+        hooks=[
+            dict(type='TextLoggerHook'),
+            dict(type='TensorboardLoggerHook')
+        ])
+    cfg.data.train.data_source.ann_file = os.path.join(
+        data_dir, 'small_coco/small_coco/instances_train2017_20.json')
+    cfg.data.train.data_source.img_prefix = os.path.join(
+        data_dir, 'small_coco/small_coco/train2017')
+    cfg.data.val.data_source.ann_file = os.path.join(
+        data_dir, 'small_coco/small_coco/instances_val2017_20.json')
+    cfg.data.val.data_source.img_prefix = os.path.join(
+        data_dir, 'small_coco/small_coco/val2017')
+    cfg.data.imgs_per_gpu = imgs_per_gpu
+    cfg.data.workers_per_gpu = 2
+    cfg.data.val.imgs_per_gpu = 2
+
+    ms_cfg_file = os.path.join(work_dir, 'ms_yolox_s_8xb16_300e_coco.json')
+    from easycv.utils.ms_utils import to_ms_config
+
+    if is_master():
+        to_ms_config(
+            cfg,
+            dump=True,
+            task=Tasks.image_object_detection,
+            ms_model_name=Models.yolox,
+            pipeline_name=Pipelines.easycv_detection,
+            save_path=ms_cfg_file)
+
+    trainer_name = Trainers.easycv
+    kwargs = dict(
+        task=Tasks.image_object_detection,
+        cfg_file=ms_cfg_file,
+        launcher='pytorch' if dist else None)
+
+    trainer = build_trainer(trainer_name, kwargs)
+    trainer.train()
+
+
+@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+class EasyCVTrainerTestSingleGpu(unittest.TestCase):
+
+    def setUp(self):
+        self.logger = get_logger()
+        self.logger.info(('Testing %s.%s' %
+                          (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+    @unittest.skipIf(
+        True, 'The test cases are all run in the master process, '
+        'cause registry conflicts, and it should run in the subprocess.')
+    def test_single_gpu(self):
+        # TODO: run in subprocess
+        train_func(self.tmp_dir)
+
+        results_files = os.listdir(self.tmp_dir)
+        json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+
+        with open(json_files[0], 'r') as f:
+            lines = [i.strip() for i in f.readlines()]
+
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 3,
+                LogKeys.LR: 0.00013
+            }, json.loads(lines[0]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 10
+            }, json.loads(lines[1]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 3,
+                LogKeys.LR: 0.00157
+            }, json.loads(lines[2]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 10
+            }, json.loads(lines[3]))
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+        for i in [0, 2]:
+            self.assertIn(LogKeys.DATA_LOAD_TIME, lines[i])
+            self.assertIn(LogKeys.ITER_TIME, lines[i])
+            self.assertIn(LogKeys.MEMORY, lines[i])
+            self.assertIn('total_loss', lines[i])
+        for i in [1, 3]:
+            self.assertIn(
+                'CocoDetectionEvaluator_DetectionBoxes_Precision/mAP',
+                lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP@.50IOU', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP@.75IOU', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP (small)', lines[i])
+
+
+@unittest.skipIf(not torch.cuda.is_available()
+                 or torch.cuda.device_count() <= 1, 'distributed unittest')
+class EasyCVTrainerTestMultiGpus(DistributedTestCase):
+
+    def setUp(self):
+        self.logger = get_logger()
+        self.logger.info(('Testing %s.%s' %
+                          (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_multi_gpus(self):
+        self.start(
+            train_func,
+            num_gpus=2,
+            work_dir=self.tmp_dir,
+            dist=True,
+            log_config=2,
+            imgs_per_gpu=5)
+
+        results_files = os.listdir(self.tmp_dir)
+        json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+
+        with open(json_files[0], 'r') as f:
+            lines = [i.strip() for i in f.readlines()]
+
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 2,
+                LogKeys.LR: 0.0002
+            }, json.loads(lines[0]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 5
+            }, json.loads(lines[1]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 2,
+                LogKeys.LR: 0.0018
+            }, json.loads(lines[2]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 5
+            }, json.loads(lines[3]))
+
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+
+        for i in [0, 2]:
+            self.assertIn(LogKeys.DATA_LOAD_TIME, lines[i])
+            self.assertIn(LogKeys.ITER_TIME, lines[i])
+            self.assertIn(LogKeys.MEMORY, lines[i])
+            self.assertIn('total_loss', lines[i])
+        for i in [1, 3]:
+            self.assertIn(
+                'CocoDetectionEvaluator_DetectionBoxes_Precision/mAP',
+                lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP@.50IOU', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP@.75IOU', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP (small)', lines[i])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/easycv/test_segformer.py b/tests/trainers/easycv/test_segformer.py
new file mode 100644
index 00000000..0da47ef6
--- /dev/null
+++ b/tests/trainers/easycv/test_segformer.py
@@ -0,0 +1,99 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import shutil
+import tempfile
+import unittest
+
+import requests
+import torch
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import LogKeys, Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+from modelscope.utils.torch_utils import is_master
+
+
+def _download_data(url, save_dir):
+    r = requests.get(url, verify=True)
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    zip_name = os.path.split(url)[-1]
+    save_path = os.path.join(save_dir, zip_name)
+    with open(save_path, 'wb') as f:
+        f.write(r.content)
+
+    unpack_dir = os.path.join(save_dir, os.path.splitext(zip_name)[0])
+    shutil.unpack_archive(save_path, unpack_dir)
+
+
+@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+class EasyCVTrainerTestSegformer(unittest.TestCase):
+
+    def setUp(self):
+        self.logger = get_logger()
+        self.logger.info(('Testing %s.%s' %
+                          (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+    def _train(self):
+        from modelscope.trainers.easycv.trainer import EasyCVEpochBasedTrainer
+
+        url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/datasets/small_coco_stuff164k.zip'
+        data_dir = os.path.join(self.tmp_dir, 'data')
+        if is_master():
+            _download_data(url, data_dir)
+
+        # adapt to distributed mode
+        from easycv.utils.test_util import pseudo_dist_init
+        pseudo_dist_init()
+
+        root_path = os.path.join(data_dir, 'small_coco_stuff164k')
+        cfg_options = {
+            'train.max_epochs':
+            2,
+            'dataset.train.data_source.img_root':
+            os.path.join(root_path, 'train2017'),
+            'dataset.train.data_source.label_root':
+            os.path.join(root_path, 'annotations/train2017'),
+            'dataset.train.data_source.split':
+            os.path.join(root_path, 'train.txt'),
+            'dataset.val.data_source.img_root':
+            os.path.join(root_path, 'val2017'),
+            'dataset.val.data_source.label_root':
+            os.path.join(root_path, 'annotations/val2017'),
+            'dataset.val.data_source.split':
+            os.path.join(root_path, 'val.txt'),
+        }
+
+        trainer_name = Trainers.easycv
+        kwargs = dict(
+            task=Tasks.image_segmentation,
+            model='EasyCV/EasyCV-Segformer-b0',
+            work_dir=self.tmp_dir,
+            cfg_options=cfg_options)
+
+        trainer = build_trainer(trainer_name, kwargs)
+        trainer.train()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_single_gpu_segformer(self):
+        self._train()
+
+        results_files = os.listdir(self.tmp_dir)
+        json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
index d934a86c..8b89fa68 100644
--- a/tests/utils/test_config.py
+++ b/tests/utils/test_config.py
@@ -4,6 +4,8 @@ import copy
 import tempfile
 import unittest
 
+import json
+
 from modelscope.utils.config import Config, check_config
 
 obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
@@ -43,7 +45,8 @@ class ConfigTest(unittest.TestCase):
             self.assertEqual(pretty_text, cfg.dump())
             cfg.dump(ofile.name)
             with open(ofile.name, 'r') as infile:
-                self.assertEqual(json_str, infile.read())
+                self.assertDictEqual(
+                    json.loads(json_str), json.loads(infile.read()))
 
         with tempfile.NamedTemporaryFile(suffix='.yaml') as ofile:
             cfg.dump(ofile.name)

From 20a935d4065e6b05dcd5e688108cbe337519e95c Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Fri, 26 Aug 2022 14:54:45 +0800
Subject: [PATCH 014/175] [to #42322933] add gpt3 base finetune
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

添加 gpt3 中小模型单机单卡下的 finetune 代码
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9899004
---
 modelscope/models/nlp/gpt3/modeling_gpt3.py   | 22 +++--
 .../models/nlp/palm_v2/modeling_palm.py       | 10 +-
 .../nlp/palm_v2/palm_for_text_generation.py   |  7 +-
 modelscope/preprocessors/nlp.py               | 19 ++--
 ...er.py => test_finetune_text_generation.py} | 92 +++++++++++++------
 5 files changed, 101 insertions(+), 49 deletions(-)
 rename tests/trainers/{test_text_generation_trainer.py => test_finetune_text_generation.py} (56%)

diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py
index f7024713..4e30f697 100644
--- a/modelscope/models/nlp/gpt3/modeling_gpt3.py
+++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py
@@ -16,9 +16,10 @@ import math
 import os
 from typing import Optional, Union
 
+import addict
 import torch
-from addict import Dict
-from torch.nn import Dropout, Embedding, LayerNorm, Linear, Module, Softmax
+from torch.nn import (CrossEntropyLoss, Dropout, Embedding, LayerNorm, Linear,
+                      Module, Softmax)
 from torch.nn import functional as F
 from transformers.modeling_utils import PreTrainedModel
 
@@ -308,20 +309,25 @@ class GPT3Model(PreTrainedModel):
                 input_ids,
                 attention_mask=None,
                 position_ids=None,
+                labels=None,
                 **kwargs):
         seq_length = input_ids.size(1)
-        if attention_mask is None:
-            attention_mask = torch.tril(
-                torch.ones((1, seq_length, seq_length),
-                           dtype=torch.long,
-                           device=input_ids.device))
+        attention_mask = torch.tril(
+            torch.ones((1, 1, seq_length, seq_length),
+                       dtype=torch.long,
+                       device=input_ids.device))
         if position_ids is None:
             position_ids = torch.arange(
                 seq_length, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 
         logits = self.language_model(input_ids, attention_mask, position_ids)
-        return Dict(logits=logits)
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(
+                logits.view(-1, self.config.vocab_size), labels.view(-1))
+        return addict.Dict(loss=loss, logits=logits)
 
     @classmethod
     def from_pretrained(
diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py
index 1cbf4f58..ff6fd732 100644
--- a/modelscope/models/nlp/palm_v2/modeling_palm.py
+++ b/modelscope/models/nlp/palm_v2/modeling_palm.py
@@ -6,6 +6,7 @@ import subprocess
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Union
 
+import addict
 import json
 import numpy as np
 import torch
@@ -726,10 +727,11 @@ class PalmForConditionalGeneration(PalmPreTrainedModel):
                                    self.palm.vocab_size,
                                    config.label_smoothing)
 
-    def forward(self, src, tgt, mask_src):
-        output = self.palm(src, tgt, mask_src)[0]
-        loss = self.loss(tgt, output)
-        return loss
+    def forward(self, input_ids, attention_mask, labels):
+        output = self.palm(
+            src=input_ids, tgt=labels, mask_src=attention_mask)[0]
+        loss = self.loss(labels, output)
+        return addict.Dict(loss=loss)
 
 
 class Translator(nn.Module):
diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
index 98aa56c7..ae92427e 100644
--- a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
+++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
@@ -63,14 +63,15 @@ class PalmForTextGeneration(TorchModel):
                     }
         """
         if self.training:
-            return {'loss': self.model(**input)}
+            return self.model(**input)
         else:
-            outputs = self.generator(input['src'], input['mask_src'])
+            outputs = self.generator(input['input_ids'],
+                                     input['attention_mask'])
             preds = outputs['predictions']
             pred_ids_list = [
                 pred_batch[0].cpu().numpy().tolist() for pred_batch in preds
             ]
-            tgt_ids_list = input['tgt'].cpu().numpy().tolist()
+            tgt_ids_list = input['labels'].cpu().numpy().tolist()
             return {
                 'preds': self._evaluate_postprocess(pred_ids_list),
                 'tgts': self._evaluate_postprocess(tgt_ids_list)
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 094cbfe2..345d3711 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -368,15 +368,20 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
     def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]:
         if self._mode == ModeKeys.INFERENCE:
             return super().__call__(data)
-        src_txt = data['src_txt']
-        tgt_txt = data['tgt_txt']
-        src_rst = super().__call__(src_txt)
-        tgt_rst = super().__call__(tgt_txt)
+        src_rst = super().__call__(data['src_txt'])
+        src_input_ids = src_rst['input_ids']
+        src_attention_mask = src_rst['attention_mask']
+        if 'tgt_txt' in data:
+            labels = super().__call__(data['tgt_txt'])['input_ids']
+        else:
+            labels = src_input_ids[1:]
+            src_input_ids = src_input_ids[:-1]
+            src_attention_mask = src_attention_mask[:-1]
 
         return {
-            'src': src_rst['input_ids'],
-            'tgt': tgt_rst['input_ids'],
-            'mask_src': src_rst['attention_mask']
+            'input_ids': src_input_ids,
+            'attention_mask': src_attention_mask,
+            'labels': labels,
         }
 
 
diff --git a/tests/trainers/test_text_generation_trainer.py b/tests/trainers/test_finetune_text_generation.py
similarity index 56%
rename from tests/trainers/test_text_generation_trainer.py
rename to tests/trainers/test_finetune_text_generation.py
index a60bc903..8cdfdf01 100644
--- a/tests/trainers/test_text_generation_trainer.py
+++ b/tests/trainers/test_finetune_text_generation.py
@@ -6,14 +6,14 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Trainers
-from modelscope.models.nlp.palm_v2 import PalmForTextGeneration
+from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration
 from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile
 from modelscope.utils.test_utils import test_level
 
 
-class TestTextGenerationTrainer(unittest.TestCase):
+class TestFinetuneTextGeneration(unittest.TestCase):
 
     def setUp(self):
         print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
@@ -21,40 +21,41 @@ class TestTextGenerationTrainer(unittest.TestCase):
         if not os.path.exists(self.tmp_dir):
             os.makedirs(self.tmp_dir)
 
-        self.model_id = 'damo/nlp_palm2.0_text-generation_english-base'
-
-        # todo: Replace below scripts with MsDataset.load when the formal dataset service is ready
         from datasets import Dataset
-        dataset_dict = {
+
+        src_dataset_dict = {
             'src_txt': [
                 'This is test sentence1-1', 'This is test sentence2-1',
                 'This is test sentence3-1'
-            ],
+            ]
+        }
+        src_tgt_dataset_dict = {
+            'src_txt':
+            src_dataset_dict['src_txt'],
             'tgt_txt': [
                 'This is test sentence1-2', 'This is test sentence2-2',
                 'This is test sentence3-2'
             ]
         }
-        dataset = Dataset.from_dict(dataset_dict)
 
-        class MsDatasetDummy(MsDataset):
+        self.src_dataset = MsDataset(Dataset.from_dict(src_dataset_dict))
+        self.src_tgt_dataset = MsDataset(
+            Dataset.from_dict(src_tgt_dataset_dict))
 
-            def __len__(self):
-                return len(self._hf_ds)
-
-        self.dataset = MsDatasetDummy(dataset)
+        self.max_epochs = 3
 
     def tearDown(self):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_trainer(self):
+    def test_trainer_with_palm(self):
 
         kwargs = dict(
-            model=self.model_id,
-            train_dataset=self.dataset,
-            eval_dataset=self.dataset,
+            model='damo/nlp_palm2.0_text-generation_english-base',
+            train_dataset=self.src_tgt_dataset,
+            eval_dataset=self.src_tgt_dataset,
+            max_epochs=self.max_epochs,
             work_dir=self.tmp_dir)
 
         trainer = build_trainer(
@@ -62,30 +63,67 @@ class TestTextGenerationTrainer(unittest.TestCase):
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(3):
+        for i in range(self.max_epochs):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_trainer_with_model_and_args(self):
-        tmp_dir = tempfile.TemporaryDirectory().name
-        if not os.path.exists(tmp_dir):
-            os.makedirs(tmp_dir)
+    def test_trainer_with_palm_with_model_and_args(self):
 
-        cache_path = snapshot_download(self.model_id)
+        cache_path = snapshot_download(
+            'damo/nlp_palm2.0_text-generation_english-base')
         model = PalmForTextGeneration.from_pretrained(cache_path)
         kwargs = dict(
             cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
             model=model,
-            train_dataset=self.dataset,
-            eval_dataset=self.dataset,
-            max_epochs=2,
+            train_dataset=self.src_tgt_dataset,
+            eval_dataset=self.src_tgt_dataset,
+            max_epochs=self.max_epochs,
             work_dir=self.tmp_dir)
 
         trainer = build_trainer(default_args=kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(2):
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_gpt3(self):
+
+        kwargs = dict(
+            model='damo/nlp_gpt3_text-generation_chinese-base',
+            train_dataset=self.src_dataset,
+            eval_dataset=self.src_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_gpt3_with_model_and_args(self):
+
+        cache_path = snapshot_download(
+            'damo/nlp_gpt3_text-generation_chinese-base')
+        model = GPT3ForTextGeneration.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.src_dataset,
+            eval_dataset=self.src_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
     @unittest.skip

From 39485426e7c08aa1ab77fbd64639c289a156d796 Mon Sep 17 00:00:00 2001
From: "feiwu.yfw" <feiwu.yfw@alibaba-inc.com>
Date: Fri, 26 Aug 2022 22:41:13 +0800
Subject: [PATCH 015/175] [to #42322933]:fix msdataset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 修复了zip文件不同打包模式下返回路径错误问题。
* 修复了替换了数据集文件重新下载时校验失败问题。
* 修复dataset oss文件在 REUSE 模式下重复下载的问题。
* 修复了csv数据集的meta json文件中某个split的meta和file字段都为''时加载所有split失败的问题。
 * 修复了不同版本datasets路径不一致的问题。
---
 ...mage_instance_segmentation_coco_dataset.py |  5 ++-
 .../msdatasets/utils/dataset_builder.py       | 37 +++++++------------
 modelscope/msdatasets/utils/dataset_utils.py  | 11 +++++-
 modelscope/msdatasets/utils/download_utils.py |  4 +-
 modelscope/msdatasets/utils/oss_utils.py      |  8 ++--
 tests/msdatasets/test_ms_dataset.py           |  7 ++--
 ...est_image_instance_segmentation_trainer.py |  4 ++
 7 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
index 04c8e142..a001fe36 100644
--- a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
+++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
@@ -59,10 +59,13 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
                  preprocessor=None,
                  classes=None,
                  seg_prefix=None,
+                 folder_name=None,
                  test_mode=False,
                  filter_empty_gt=True,
                  **kwargs):
-        self.data_root = next(iter(split_config.values()))
+        data_root = next(iter(split_config.values()))
+        self.data_root = osp.join(data_root,
+                                  folder_name) if folder_name else data_root
         self.split = next(iter(split_config.keys()))
         self.preprocessor = preprocessor
 
diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py
index 85489c58..7180cb5b 100644
--- a/modelscope/msdatasets/utils/dataset_builder.py
+++ b/modelscope/msdatasets/utils/dataset_builder.py
@@ -8,7 +8,7 @@ from datasets.info import DatasetInfo
 from datasets.packaged_modules import csv
 from datasets.utils.filelock import FileLock
 
-from modelscope.utils.constant import DownloadMode
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -27,7 +27,6 @@ class MsCsvDatasetBuilder(csv.Csv):
         zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
         **config_kwargs,
     ):
-        self.namespace = namespace
         super().__init__(
             cache_dir=cache_dir,
             name=subset_name,
@@ -37,7 +36,7 @@ class MsCsvDatasetBuilder(csv.Csv):
 
         self.name = dataset_name
         self.info.builder_name = self.name
-        self._cache_dir = self._build_cache_dir()
+        self._cache_dir = self._build_cache_dir(namespace=namespace)
         lock_path = os.path.join(
             self._cache_dir_root,
             self._cache_dir.replace(os.sep, '_') + '.lock')
@@ -48,7 +47,6 @@ class MsCsvDatasetBuilder(csv.Csv):
                     logger.info(
                         f'Overwrite dataset info from restored data version, cache_dir is {self._cache_dir}'
                     )
-                    self.info = DatasetInfo.from_directory(self._cache_dir)
                 # dir exists but no data, remove the empty dir as data aren't available anymore
                 else:
                     logger.warning(
@@ -57,14 +55,17 @@ class MsCsvDatasetBuilder(csv.Csv):
                     os.rmdir(self._cache_dir)
         self.zip_data_files = zip_data_files
 
-    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
+    def _relative_data_dir(self,
+                           with_version=True,
+                           with_hash=True,
+                           namespace=DEFAULT_DATASET_NAMESPACE) -> str:
         """Relative path of this dataset in cache_dir:
         Will be:
             self.name/self.config.version/self.hash/
         or if a namespace has been specified:
             self.namespace___self.name/self.config.version/self.hash/
         """
-        builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}'
+        builder_data_dir = self.name if namespace is None else f'{namespace}___{self.name}'
         builder_config = self.config
         hash = self.hash
         if builder_config:
@@ -76,10 +77,11 @@ class MsCsvDatasetBuilder(csv.Csv):
             builder_data_dir = os.path.join(builder_data_dir, hash)
         return builder_data_dir
 
-    def _build_cache_dir(self):
+    def _build_cache_dir(self, namespace=DEFAULT_DATASET_NAMESPACE):
         builder_data_dir = os.path.join(
             self._cache_dir_root,
-            self._relative_data_dir(with_version=False, with_hash=True))
+            self._relative_data_dir(
+                with_version=False, with_hash=True, namespace=namespace))
 
         return builder_data_dir
 
@@ -97,15 +99,8 @@ class MsCsvDatasetBuilder(csv.Csv):
                 datasets.SplitGenerator(
                     name=split_name,
                     gen_kwargs={
-                        'files':
-                        dl_manager.iter_files(files),
-                        'base_dir':
-                        os.path.join(
-                            zip_data_files.get(split_name),
-                            os.path.splitext(
-                                self.zip_data_files.get(split_name))[0])
-                        if self.zip_data_files.get(split_name) else
-                        zip_data_files.get(split_name)
+                        'files': dl_manager.iter_files(files),
+                        'base_dir': zip_data_files.get(split_name)
                     }))
         return splits
 
@@ -181,12 +176,8 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):
         self._download_and_prepare(dl_manager=dl_manager)
 
     def _download_and_prepare(self, dl_manager):
-        split_path_dict = dl_manager.download_and_extract(self.zip_data_files)
-        self.split_path_dict = {
-            k: os.path.join(v,
-                            os.path.splitext(self.zip_data_files[k])[0])
-            for k, v in split_path_dict.items()
-        }
+        self.split_path_dict = dl_manager.download_and_extract(
+            self.zip_data_files)
 
     def as_dataset(self):
         return ExternalDataset(self.split_path_dict, self._config_kwargs)
diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py
index 09556d84..08a6de84 100644
--- a/modelscope/msdatasets/utils/dataset_utils.py
+++ b/modelscope/msdatasets/utils/dataset_utils.py
@@ -11,6 +11,14 @@ from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder
 logger = get_logger()
 
 
+def format_dataset_structure(dataset_structure):
+    """Keep only the splits with a truthy 'meta' or 'file' entry."""
+    return {
+        split: entry for split, entry in dataset_structure.items()
+        if entry.get('meta') or entry.get('file')
+    }
+
+
 def get_target_dataset_structure(dataset_structure: dict,
                                  subset_name: Optional[str] = None,
                                  split: Optional[str] = None):
@@ -56,7 +64,8 @@ def get_target_dataset_structure(dataset_structure: dict,
             f'No subset_name specified, defaulting to the {target_subset_name}'
         )
     # verify dataset split
-    target_dataset_structure = dataset_structure[target_subset_name]
+    target_dataset_structure = format_dataset_structure(
+        dataset_structure[target_subset_name])
     if split and split not in target_dataset_structure:
         raise ValueError(
             f'split {split} not found. Available: {target_dataset_structure.keys()}'
diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py
index bc637f0e..eb1c99ef 100644
--- a/modelscope/msdatasets/utils/download_utils.py
+++ b/modelscope/msdatasets/utils/download_utils.py
@@ -34,8 +34,8 @@ class DatasetDownloadManager(DownloadManager):
         url_or_filename = str(url_or_filename)
         if is_relative_path(url_or_filename):
             # fetch oss files
-            return self.oss_utilities.download(url_or_filename,
-                                               self.download_config.cache_dir)
+            return self.oss_utilities.download(
+                url_or_filename, download_config=download_config)
         else:
             return cached_path(
                 url_or_filename, download_config=download_config)
diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py
index 033c8b96..82d43bef 100644
--- a/modelscope/msdatasets/utils/oss_utils.py
+++ b/modelscope/msdatasets/utils/oss_utils.py
@@ -24,7 +24,8 @@ class OssUtilities:
             rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
             print('\r{0}% '.format(rate), end='', flush=True)
 
-    def download(self, oss_file_name, cache_dir):
+    def download(self, oss_file_name, download_config):
+        cache_dir = download_config.cache_dir
         candidate_key = os.path.join(self.oss_dir, oss_file_name)
         candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
         file_oss_key = candidate_key if self.bucket.object_exists(
@@ -32,8 +33,9 @@ class OssUtilities:
         filename = hash_url_to_filename(file_oss_key, etag=None)
         local_path = os.path.join(cache_dir, filename)
 
-        self.bucket.get_object_to_file(
-            file_oss_key, local_path, progress_callback=self._percentage)
+        if download_config.force_download or not os.path.exists(local_path):
+            self.bucket.get_object_to_file(
+                file_oss_key, local_path, progress_callback=self._percentage)
         return local_path
 
     def upload(self, oss_file_name: str, local_file_path: str) -> str:
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index 0d8c8a4d..1d62d2d1 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -37,9 +37,10 @@ class MsDatasetTest(unittest.TestCase):
             'pets_small',
             namespace=DEFAULT_DATASET_NAMESPACE,
             split='train',
-            download_mode=DownloadMode.FORCE_REDOWNLOAD,
-            classes=('1', '2'))
-        print(ms_ds_train._hf_ds.config_kwargs)
+            classes=('1', '2'),
+            folder_name='Pets')
+        print(ms_ds_train.config_kwargs)
+        assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ms_csv_basic(self):
diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py
index c8557ff5..774f8fa8 100644
--- a/tests/trainers/test_image_instance_segmentation_trainer.py
+++ b/tests/trainers/test_image_instance_segmentation_trainer.py
@@ -44,18 +44,21 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
                 name='pets_small',
                 split='train',
                 classes=('Cat', 'Dog'),
+                folder_name='Pets',
                 test_mode=False)
         if val_data_cfg is None:
             val_data_cfg = ConfigDict(
                 name='pets_small',
                 split='validation',
                 classes=('Cat', 'Dog'),
+                folder_name='Pets',
                 test_mode=True)
 
         self.train_dataset = MsDataset.load(
             dataset_name=train_data_cfg.name,
             split=train_data_cfg.split,
             classes=train_data_cfg.classes,
+            folder_name=train_data_cfg.folder_name,
             test_mode=train_data_cfg.test_mode)
         assert self.train_dataset.config_kwargs[
             'classes'] == train_data_cfg.classes
@@ -66,6 +69,7 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
             dataset_name=val_data_cfg.name,
             split=val_data_cfg.split,
             classes=val_data_cfg.classes,
+            folder_name=val_data_cfg.folder_name,
             test_mode=val_data_cfg.test_mode)
         assert self.eval_dataset.config_kwargs[
             'classes'] == val_data_cfg.classes

From 285192850d72b0f3e2bcd5a3e792f72cb5b440a5 Mon Sep 17 00:00:00 2001
From: "leyuan.hjy" <leyuan.hjy@alibaba-inc.com>
Date: Sat, 27 Aug 2022 10:48:56 +0800
Subject: [PATCH 016/175] =?UTF-8?q?[to=20#42322933]=20feat(RealtimeObjectD?=
 =?UTF-8?q?etection):=E6=96=B0=E5=A2=9E=E5=AE=9E=E6=97=B6=E6=A3=80?=
 =?UTF-8?q?=E6=B5=8Bpipeline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

新增实时目标检测pipeline
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9788299
---
 modelscope/metainfo.py                        |   2 +
 modelscope/models/cv/__init__.py              |   2 +-
 .../cv/realtime_object_detection/__init__.py  |  21 ++
 .../realtime_detector.py                      |  85 +++++++
 .../yolox/__init__.py                         |   0
 .../yolox/data/__init__.py                    |   0
 .../yolox/data/data_augment.py                |  69 ++++++
 .../yolox/exp/__init__.py                     |   5 +
 .../yolox/exp/base_exp.py                     |  12 +
 .../yolox/exp/build.py                        |  18 ++
 .../yolox/exp/default/__init__.py             |   5 +
 .../yolox/exp/default/yolox_nano.py           |  46 ++++
 .../yolox/exp/default/yolox_s.py              |  13 ++
 .../yolox/exp/default/yolox_tiny.py           |  20 ++
 .../yolox/exp/yolox_base.py                   |  59 +++++
 .../yolox/models/__init__.py                  |   7 +
 .../yolox/models/darknet.py                   | 189 ++++++++++++++++
 .../yolox/models/network_blocks.py            | 213 ++++++++++++++++++
 .../yolox/models/yolo_fpn.py                  |  80 +++++++
 .../yolox/models/yolo_head.py                 | 182 +++++++++++++++
 .../yolox/models/yolo_pafpn.py                | 126 +++++++++++
 .../yolox/models/yolox.py                     |  33 +++
 .../yolox/utils/__init__.py                   |   5 +
 .../yolox/utils/boxes.py                      | 107 +++++++++
 modelscope/pipelines/cv/__init__.py           |   3 +
 .../cv/realtime_object_detection_pipeline.py  |  50 ++++
 modelscope/utils/cv/image_utils.py            |   7 +
 .../test_realtime_object_detection.py         |  52 +++++
 28 files changed, 1410 insertions(+), 1 deletion(-)
 create mode 100644 modelscope/models/cv/realtime_object_detection/__init__.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/realtime_detector.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/__init__.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/exp/build.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py
 create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py
 create mode 100644 modelscope/pipelines/cv/realtime_object_detection_pipeline.py
 create mode 100644 tests/pipelines/test_realtime_object_detection.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 24f2f748..153ca9b4 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -11,6 +11,7 @@ class Models(object):
     """
     # vision models
     detection = 'detection'
+    realtime_object_detection = 'realtime-object-detection'
     scrfd = 'scrfd'
     classification_model = 'ClassificationModel'
     nafnet = 'nafnet'
@@ -111,6 +112,7 @@ class Pipelines(object):
     image_super_resolution = 'rrdb-image-super-resolution'
     face_image_generation = 'gan-face-image-generation'
     product_retrieval_embedding = 'resnet50-product-retrieval-embedding'
+    realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
     face_recognition = 'ir101-face-recognition-cfglint'
     image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
     image2image_translation = 'image-to-image-translation'
diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py
index 227be2c7..74451c31 100644
--- a/modelscope/models/cv/__init__.py
+++ b/modelscope/models/cv/__init__.py
@@ -7,5 +7,5 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints,
                image_reid_person, image_semantic_segmentation,
                image_to_image_generation, image_to_image_translation,
                object_detection, product_retrieval_embedding,
-               salient_detection, super_resolution,
+               realtime_object_detection, salient_detection, super_resolution,
                video_single_object_tracking, video_summarization, virual_tryon)
diff --git a/modelscope/models/cv/realtime_object_detection/__init__.py b/modelscope/models/cv/realtime_object_detection/__init__.py
new file mode 100644
index 00000000..aed13cec
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .realtime_detector import RealtimeDetector
+else:
+    _import_structure = {
+        'realtime_detector': ['RealtimeDetector'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/realtime_object_detection/realtime_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_detector.py
new file mode 100644
index 00000000..b147f769
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/realtime_detector.py
@@ -0,0 +1,85 @@
+import argparse
+import logging as logger
+import os
+import os.path as osp
+import time
+
+import cv2
+import json
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .yolox.data.data_augment import ValTransform
+from .yolox.exp import get_exp_by_name
+from .yolox.utils import postprocess
+
+
+@MODELS.register_module(
+    group_key=Tasks.image_object_detection,
+    module_name=Models.realtime_object_detection)
+class RealtimeDetector(TorchModel):
+    """YOLOX-based detector built from the configuration in ``model_dir``."""
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+
+        # model type
+        self.exp = get_exp_by_name(self.config.model_type)
+
+        # build model
+        self.model = self.exp.get_model()
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
+        ckpt = torch.load(model_path, map_location='cpu')
+
+        # load the model state dict
+        self.model.load_state_dict(ckpt['model'])
+        self.model.eval()
+
+        # params setting
+        self.exp.num_classes = self.config.num_classes
+        self.confthre = self.config.conf_thr
+        self.num_classes = self.exp.num_classes
+        self.nmsthre = self.exp.nmsthre
+        self.test_size = self.exp.test_size
+        self.preproc = ValTransform(legacy=False)
+
+    def inference(self, img):
+        """Run the network on ``img`` with gradients disabled."""
+        with torch.no_grad():
+            outputs = self.model(img)
+        return outputs
+
+    def forward(self, inputs):
+        """Delegate to ``inference``."""
+        return self.inference(inputs)
+
+    def preprocess(self, img):
+        """Load ``img``, record its resize ratio and return a float batch."""
+        img = LoadImage.convert_to_ndarray(img)
+        # postprocess() divides the predicted boxes by this ratio
+        self.ratio = min(self.test_size[0] / img.shape[0],
+                         self.test_size[1] / img.shape[1])
+        img, _ = self.preproc(img, None, self.test_size)
+        return torch.from_numpy(img).unsqueeze(0).float()
+
+    def postprocess(self, input):
+        """Return ``(bboxes, scores, labels)`` arrays for a one-image batch."""
+        outputs = postprocess(
+            input, self.num_classes, self.confthre, self.nmsthre,
+            class_agnostic=True)
+        if len(outputs) != 1:
+            raise ValueError(f'expected 1 result, got {len(outputs)}')
+        dets = outputs[0]
+        if dets is None:  # presumably "nothing detected" - TODO confirm
+            dets = torch.zeros((0, 7))
+        bboxes = dets[:, 0:4].cpu().numpy() / self.ratio
+        scores = dets[:, 5].cpu().numpy()
+        labels = dets[:, 6].cpu().int().numpy()
+        return bboxes, scores, labels
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py b/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
new file mode 100644
index 00000000..b52a65fe
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
@@ -0,0 +1,69 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+"""
+Data augmentation functionality. Passed as callable transformations to
+Dataset classes.
+
+The data augmentation procedures were interpreted from @weiliu89's SSD paper
+http://arxiv.org/abs/1512.02325
+"""
+
+import math
+import random
+
+import cv2
+import numpy as np
+
+from ..utils import xyxy2cxcywh
+
+
+def preproc(img, input_size, swap=(2, 0, 1)):
+    """Resize ``img`` by one ratio onto a 114-filled ``input_size`` canvas.
+
+    Returns the canvas transposed by ``swap`` as contiguous float32, plus the
+    resize ratio.
+    """
+    src_h, src_w = img.shape[0], img.shape[1]
+    scale = min(input_size[0] / src_h, input_size[1] / src_w)
+    new_h, new_w = int(src_h * scale), int(src_w * scale)
+
+    is_color = len(img.shape) == 3
+    shape = (input_size[0], input_size[1], 3) if is_color else input_size
+    canvas = np.ones(shape, dtype=np.uint8) * 114
+    canvas[:new_h, :new_w] = cv2.resize(
+        img, (new_w, new_h), interpolation=cv2.INTER_LINEAR).astype(np.uint8)
+
+    canvas = canvas.transpose(swap)
+    return np.ascontiguousarray(canvas, dtype=np.float32), scale
+
+
+class ValTransform:
+    """
+    Callable evaluation/inference transform built on ``preproc``.
+
+    ``__call__`` passes the image, ``input_size`` and ``swap`` to
+    ``preproc`` and returns the processed image with a placeholder array.
+
+    Arguments:
+        swap ((int,int,int)): axis order used for the final transpose
+        legacy (bool): if True, also reverse the first axis of the
+            result, divide by 255 and normalise with the three-value
+            mean/std constants hard-coded in ``__call__``
+
+    Returns (``__call__``):
+        (img, placeholder): ``placeholder`` is always ``np.zeros((1, 5))``;
+        the ``res`` argument is not used
+    """
+
+    def __init__(self, swap=(2, 0, 1), legacy=False):
+        self.swap = swap
+        self.legacy = legacy
+
+    # assume input is cv2 img for now
+    def __call__(self, img, res, input_size):
+        img, _ = preproc(img, input_size, self.swap)
+        if self.legacy:
+            img = img[::-1, :, :].copy()
+            img /= 255.0
+            img -= np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
+            img /= np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
+        return img, np.zeros((1, 5))
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
new file mode 100644
index 00000000..e8e3be15
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .base_exp import BaseExp
+from .build import get_exp_by_name
+from .yolox_base import Exp
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
new file mode 100644
index 00000000..a4278cbf
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
@@ -0,0 +1,12 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from abc import ABCMeta, abstractmethod
+
+from torch.nn import Module
+
+
+class BaseExp(metaclass=ABCMeta):
+
+    @abstractmethod
+    def get_model(self) -> Module:
+        """Build and return the model; concrete experiments must override."""
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py
new file mode 100644
index 00000000..4858100c
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py
@@ -0,0 +1,18 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+import sys
+
+
+def get_exp_by_name(exp_name):
+    """Instantiate the Exp named ``exp_name``; ValueError if unknown."""
+    exp = exp_name.replace('-', '_')  # "yolox-s" -> "yolox_s"
+    if exp == 'yolox_s':
+        from .default import YoloXSExp as YoloXExp
+    elif exp == 'yolox_nano':
+        from .default import YoloXNanoExp as YoloXExp
+    elif exp == 'yolox_tiny':
+        from .default import YoloXTinyExp as YoloXExp
+    else:
+        raise ValueError(f'unsupported exp name: {exp_name}')
+    return YoloXExp()
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py
new file mode 100644
index 00000000..552bbccd
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .yolox_nano import YoloXNanoExp
+from .yolox_s import YoloXSExp
+from .yolox_tiny import YoloXTinyExp
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
new file mode 100644
index 00000000..330eef16
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
@@ -0,0 +1,46 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+
+import torch.nn as nn
+
+from ..yolox_base import Exp as YoloXExp
+
+
+class YoloXNanoExp(YoloXExp):
+    """YOLOX-Nano experiment: depthwise backbone/head, 416x416 test size."""
+
+    def __init__(self):
+        super(YoloXNanoExp, self).__init__()
+        self.depth = 0.33
+        self.width = 0.25
+        self.input_size = (416, 416)
+        self.test_size = (416, 416)
+
+    def get_model(self, sublinear=False):
+        """Build the depthwise YOLOX model on first call and return it.
+
+        ``sublinear`` is unused. NOTE(review): unlike ``Exp.get_model`` in
+        yolox_base.py, this override leaves BatchNorm eps/momentum, head
+        biases and train/eval mode untouched - confirm that is intended.
+        """
+        if 'model' not in self.__dict__:
+            from ...models import YOLOX, YOLOPAFPN, YOLOXHead
+            in_channels = [256, 512, 1024]
+            # NANO model use depthwise = True, which is main difference.
+            backbone = YOLOPAFPN(
+                self.depth,
+                self.width,
+                in_channels=in_channels,
+                act=self.act,
+                depthwise=True,
+            )
+            head = YOLOXHead(
+                self.num_classes,
+                self.width,
+                in_channels=in_channels,
+                act=self.act,
+                depthwise=True)
+            self.model = YOLOX(backbone, head)
+
+        return self.model
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py
new file mode 100644
index 00000000..5a123b37
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py
@@ -0,0 +1,13 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+
+from ..yolox_base import Exp as YoloXExp
+
+
+class YoloXSExp(YoloXExp):
+    """YOLOX-S experiment: the base Exp with depth 0.33 and width 0.50."""
+
+    def __init__(self):
+        super().__init__()
+        self.depth, self.width = 0.33, 0.50
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py
new file mode 100644
index 00000000..a80d0f2d
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py
@@ -0,0 +1,20 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+
+from ..yolox_base import Exp as YoloXExp
+
+
+class YoloXTinyExp(YoloXExp):
+    """YOLOX-Tiny experiment: width 0.375, 416x416 input and test size."""
+
+    def __init__(self):
+        super().__init__()
+        self.depth, self.width = 0.33, 0.375
+        self.input_size = (416, 416)
+        self.mosaic_scale = (0.5, 1.5)
+        self.random_size = (10, 20)
+        self.test_size = (416, 416)
+        this_file = os.path.realpath(__file__)
+        self.exp_name = os.path.basename(this_file).split('.')[0]
+        self.enable_mixup = False
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py
new file mode 100644
index 00000000..a2a41535
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py
@@ -0,0 +1,59 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+import random
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+
+from .base_exp import BaseExp
+
+
+class Exp(BaseExp):
+    """Default YOLOX experiment settings; subclasses override the fields."""
+
+    def __init__(self):
+        super().__init__()
+        # ---------------- model config ---------------- #
+        # number of classes the model detects
+        self.num_classes = 80
+        self.depth = 1.00  # factor of model depth
+        self.width = 1.00  # factor of model width
+        # activation name; passed as ``act`` to YOLOPAFPN and YOLOXHead
+        self.act = 'silu'
+        # -----------------  testing config ------------------ #
+        # output image size during evaluation/test
+        self.test_size = (640, 640)
+        # test-time confidence threshold; boxes scoring lower are filtered
+        self.test_conf = 0.01
+        self.nmsthre = 0.65  # nms threshold
+
+    def get_model(self):
+        """Return the model, created on first use, switched to train mode.
+
+        NOTE: the BatchNorm tweak, head bias initialisation and ``train()``
+        below run on every call, including calls that reuse ``self.model``.
+        """
+        from ..models import YOLOX, YOLOPAFPN, YOLOXHead
+
+        def init_yolo(M):
+            for m in M.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eps = 1e-3
+                    m.momentum = 0.03
+
+        if getattr(self, 'model', None) is None:
+            in_channels = [256, 512, 1024]
+            backbone = YOLOPAFPN(
+                self.depth, self.width, in_channels=in_channels, act=self.act)
+            head = YOLOXHead(
+                self.num_classes,
+                self.width,
+                in_channels=in_channels,
+                act=self.act)
+            self.model = YOLOX(backbone, head)
+        self.model.apply(init_yolo)
+        self.model.head.initialize_biases(1e-2)
+        self.model.train()
+        return self.model
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py
new file mode 100644
index 00000000..20b1a0d1
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py
@@ -0,0 +1,7 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .darknet import CSPDarknet, Darknet
+from .yolo_fpn import YOLOFPN
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+from .yolox import YOLOX
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py b/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py
new file mode 100644
index 00000000..8ece2a1e
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py
@@ -0,0 +1,189 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from torch import nn
+
+from .network_blocks import (BaseConv, CSPLayer, DWConv, Focus, ResLayer,
+                             SPPBottleneck)
+
+
+class Darknet(nn.Module):
+    # number of blocks from dark2 to dark5.
+    depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}
+
+    def __init__(
+            self,
+            depth,
+            in_channels=3,
+            stem_out_channels=32,
+            out_features=('dark3', 'dark4', 'dark5'),
+    ):
+        """
+        Args:
+            depth (int): depth of darknet used in model, usually use [21, 53] for this param.
+            in_channels (int): number of input channels, for example, use 3 for RGB image.
+            stem_out_channels (int): number of output channels of darknet stem.
+                It decides channels of darknet layer2 to layer5.
+            out_features (Tuple[str]): desired output layer name.
+        """
+        super().__init__()
+        assert out_features, 'please provide output features of Darknet'
+        self.out_features = out_features
+        self.stem = nn.Sequential(
+            BaseConv(
+                in_channels, stem_out_channels, ksize=3, stride=1,
+                act='lrelu'),
+            *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
+        )
+        in_channels = stem_out_channels * 2  # 64
+
+        num_blocks = Darknet.depth2blocks[depth]
+        # create darknet with `stem_out_channels` and `num_blocks` layers.
+        # to make model structure more clear, we don't use `for` statement in python.
+        self.dark2 = nn.Sequential(
+            *self.make_group_layer(in_channels, num_blocks[0], stride=2))
+        in_channels *= 2  # 128
+        self.dark3 = nn.Sequential(
+            *self.make_group_layer(in_channels, num_blocks[1], stride=2))
+        in_channels *= 2  # 256
+        self.dark4 = nn.Sequential(
+            *self.make_group_layer(in_channels, num_blocks[2], stride=2))
+        in_channels *= 2  # 512
+
+        self.dark5 = nn.Sequential(
+            *self.make_group_layer(in_channels, num_blocks[3], stride=2),
+            *self.make_spp_block([in_channels, in_channels * 2],
+                                 in_channels * 2),
+        )
+
+    def make_group_layer(self,
+                         in_channels: int,
+                         num_blocks: int,
+                         stride: int = 1):
+        'starts with conv layer then has `num_blocks` `ResLayer`'
+        return [
+            BaseConv(
+                in_channels,
+                in_channels * 2,
+                ksize=3,
+                stride=stride,
+                act='lrelu'),
+            *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)],
+        ]
+
+    def make_spp_block(self, filters_list, in_filters):
+        m = nn.Sequential(*[
+            BaseConv(in_filters, filters_list[0], 1, stride=1, act='lrelu'),
+            BaseConv(
+                filters_list[0], filters_list[1], 3, stride=1, act='lrelu'),
+            SPPBottleneck(
+                in_channels=filters_list[1],
+                out_channels=filters_list[0],
+                activation='lrelu',
+            ),
+            BaseConv(
+                filters_list[0], filters_list[1], 3, stride=1, act='lrelu'),
+            BaseConv(
+                filters_list[1], filters_list[0], 1, stride=1, act='lrelu'),
+        ])
+        return m
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        outputs['stem'] = x
+        x = self.dark2(x)
+        outputs['dark2'] = x
+        x = self.dark3(x)
+        outputs['dark3'] = x
+        x = self.dark4(x)
+        outputs['dark4'] = x
+        x = self.dark5(x)
+        outputs['dark5'] = x
+        return {k: v for k, v in outputs.items() if k in self.out_features}
+
+
+class CSPDarknet(nn.Module):
+
+    def __init__(
+        self,
+        dep_mul,
+        wid_mul,
+        out_features=('dark3', 'dark4', 'dark5'),
+        depthwise=False,
+        act='silu',
+    ):
+        super().__init__()
+        assert out_features, 'please provide output features of Darknet'
+        self.out_features = out_features
+        Conv = DWConv if depthwise else BaseConv
+
+        base_channels = int(wid_mul * 64)  # 64
+        base_depth = max(round(dep_mul * 3), 1)  # 3
+
+        # stem
+        self.stem = Focus(3, base_channels, ksize=3, act=act)
+
+        # dark2
+        self.dark2 = nn.Sequential(
+            Conv(base_channels, base_channels * 2, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 2,
+                base_channels * 2,
+                n=base_depth,
+                depthwise=depthwise,
+                act=act,
+            ),
+        )
+
+        # dark3
+        self.dark3 = nn.Sequential(
+            Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 4,
+                base_channels * 4,
+                n=base_depth * 3,
+                depthwise=depthwise,
+                act=act,
+            ),
+        )
+
+        # dark4
+        self.dark4 = nn.Sequential(
+            Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 8,
+                base_channels * 8,
+                n=base_depth * 3,
+                depthwise=depthwise,
+                act=act,
+            ),
+        )
+
+        # dark5
+        self.dark5 = nn.Sequential(
+            Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
+            SPPBottleneck(
+                base_channels * 16, base_channels * 16, activation=act),
+            CSPLayer(
+                base_channels * 16,
+                base_channels * 16,
+                n=base_depth,
+                shortcut=False,
+                depthwise=depthwise,
+                act=act,
+            ),
+        )
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        outputs['stem'] = x
+        x = self.dark2(x)
+        outputs['dark2'] = x
+        x = self.dark3(x)
+        outputs['dark3'] = x
+        x = self.dark4(x)
+        outputs['dark4'] = x
+        x = self.dark5(x)
+        outputs['dark5'] = x
+        return {k: v for k, v in outputs.items() if k in self.out_features}
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py
new file mode 100644
index 00000000..fd15c1c1
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py
@@ -0,0 +1,213 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch
+import torch.nn as nn
+
+
+def get_activation(name='silu', inplace=True):
+    if name == 'silu':
+        module = nn.SiLU(inplace=inplace)
+    else:
+        raise AttributeError('Unsupported act type: {}'.format(name))
+    return module
+
+
+class BaseConv(nn.Module):
+    """A Conv2d -> Batchnorm -> silu/leaky relu block"""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride,
+                 groups=1,
+                 bias=False,
+                 act='silu'):
+        super(BaseConv, self).__init__()
+        # same padding
+        pad = (ksize - 1) // 2
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=pad,
+            groups=groups,
+            bias=bias,
+        )
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.act = get_activation(act, inplace=True)
+
+    def forward(self, x):
+        return self.act(self.bn(self.conv(x)))
+
+    def fuseforward(self, x):
+        return self.act(self.conv(x))
+
+
+class DWConv(nn.Module):
+    """Depthwise Conv + Conv"""
+
+    def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'):
+        super(DWConv, self).__init__()
+        self.dconv = BaseConv(
+            in_channels,
+            in_channels,
+            ksize=ksize,
+            stride=stride,
+            groups=in_channels,
+            act=act,
+        )
+        self.pconv = BaseConv(
+            in_channels, out_channels, ksize=1, stride=1, groups=1, act=act)
+
+    def forward(self, x):
+        x = self.dconv(x)
+        return self.pconv(x)
+
+
+class Bottleneck(nn.Module):
+    # Standard bottleneck
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+    ):
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)
+        Conv = DWConv if depthwise else BaseConv
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act)
+        self.use_add = shortcut and in_channels == out_channels
+
+    def forward(self, x):
+        y = self.conv2(self.conv1(x))
+        if self.use_add:
+            y = y + x
+        return y
+
+
+class ResLayer(nn.Module):
+    'Residual layer with `in_channels` inputs.'
+
+    def __init__(self, in_channels: int):
+        super().__init__()
+        mid_channels = in_channels // 2
+        self.layer1 = BaseConv(
+            in_channels, mid_channels, ksize=1, stride=1, act='lrelu')
+        self.layer2 = BaseConv(
+            mid_channels, in_channels, ksize=3, stride=1, act='lrelu')
+
+    def forward(self, x):
+        out = self.layer2(self.layer1(x))
+        return x + out
+
+
+class SPPBottleneck(nn.Module):
+    """Spatial pyramid pooling layer used in YOLOv3-SPP"""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_sizes=(5, 9, 13),
+                 activation='silu'):
+        super().__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=activation)
+        self.m = nn.ModuleList([
+            nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
+            for ks in kernel_sizes
+        ])
+        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
+        self.conv2 = BaseConv(
+            conv2_channels, out_channels, 1, stride=1, act=activation)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = torch.cat([x] + [m(x) for m in self.m], dim=1)
+        x = self.conv2(x)
+        return x
+
+
+class CSPLayer(nn.Module):
+    """C3 in yolov5, CSP Bottleneck with 3 convolutions"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=1,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of Bottlenecks. Default value: 1.
+        """
+        # in_channels, out_channels, n, shortcut, expansion, depthwise, act
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv3 = BaseConv(
+            2 * hidden_channels, out_channels, 1, stride=1, act=act)
+        module_list = [
+            Bottleneck(
+                hidden_channels,
+                hidden_channels,
+                shortcut,
+                1.0,
+                depthwise,
+                act=act) for _ in range(n)
+        ]
+        self.m = nn.Sequential(*module_list)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_2 = self.conv2(x)
+        x_1 = self.m(x_1)
+        x = torch.cat((x_1, x_2), dim=1)
+        return self.conv3(x)
+
+
+class Focus(nn.Module):
+    """Focus width and height information into channel space."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=1,
+                 stride=1,
+                 act='silu'):
+        super().__init__()
+        self.conv = BaseConv(
+            in_channels * 4, out_channels, ksize, stride, act=act)
+
+    def forward(self, x):
+        # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
+        patch_top_left = x[..., ::2, ::2]
+        patch_top_right = x[..., ::2, 1::2]
+        patch_bot_left = x[..., 1::2, ::2]
+        patch_bot_right = x[..., 1::2, 1::2]
+        x = torch.cat(
+            (
+                patch_top_left,
+                patch_bot_left,
+                patch_top_right,
+                patch_bot_right,
+            ),
+            dim=1,
+        )
+        return self.conv(x)
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py
new file mode 100644
index 00000000..0cbebb09
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py
@@ -0,0 +1,80 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch
+import torch.nn as nn
+
+from .darknet import Darknet
+from .network_blocks import BaseConv
+
+
+class YOLOFPN(nn.Module):
+    """
+    YOLOFPN module. Darknet 53 is the default backbone of this model.
+    """
+
+    def __init__(
+        self,
+        depth=53,
+        in_features=['dark3', 'dark4', 'dark5'],
+    ):
+        super(YOLOFPN, self).__init__()
+
+        self.backbone = Darknet(depth)
+        self.in_features = in_features
+
+        # out 1
+        self.out1_cbl = self._make_cbl(512, 256, 1)
+        self.out1 = self._make_embedding([256, 512], 512 + 256)
+
+        # out 2
+        self.out2_cbl = self._make_cbl(256, 128, 1)
+        self.out2 = self._make_embedding([128, 256], 256 + 128)
+
+        # upsample
+        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+
+    def _make_cbl(self, _in, _out, ks):
+        return BaseConv(_in, _out, ks, stride=1, act='lrelu')
+
+    def _make_embedding(self, filters_list, in_filters):
+        m = nn.Sequential(*[
+            self._make_cbl(in_filters, filters_list[0], 1),
+            self._make_cbl(filters_list[0], filters_list[1], 3),
+            self._make_cbl(filters_list[1], filters_list[0], 1),
+            self._make_cbl(filters_list[0], filters_list[1], 3),
+            self._make_cbl(filters_list[1], filters_list[0], 1),
+        ])
+        return m
+
+    def load_pretrained_model(self, filename='./weights/darknet53.mix.pth'):
+        with open(filename, 'rb') as f:
+            state_dict = torch.load(f, map_location='cpu')
+        print('loading pretrained weights...')
+        self.backbone.load_state_dict(state_dict)
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (Tensor): input image.
+
+        Returns:
+            Tuple[Tensor]: FPN output features.
+        """
+        #  backbone
+        out_features = self.backbone(inputs)
+        x2, x1, x0 = [out_features[f] for f in self.in_features]
+
+        #  yolo branch 1
+        x1_in = self.out1_cbl(x0)
+        x1_in = self.upsample(x1_in)
+        x1_in = torch.cat([x1_in, x1], 1)
+        out_dark4 = self.out1(x1_in)
+
+        #  yolo branch 2
+        x2_in = self.out2_cbl(out_dark4)
+        x2_in = self.upsample(x2_in)
+        x2_in = torch.cat([x2_in, x2], 1)
+        out_dark3 = self.out2(x2_in)
+
+        outputs = (out_dark3, out_dark4, x0)
+        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py
new file mode 100644
index 00000000..1eef93a4
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py
@@ -0,0 +1,182 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import bboxes_iou, meshgrid
+from .network_blocks import BaseConv, DWConv
+
+
+class YOLOXHead(nn.Module):
+
+    def __init__(
+        self,
+        num_classes,
+        width=1.0,
+        strides=[8, 16, 32],
+        in_channels=[256, 512, 1024],
+        act='silu',
+        depthwise=False,
+    ):
+        """
+        Args:
+            act (str): activation type of conv. Default value: "silu".
+            depthwise (bool): whether to apply depthwise conv in conv branch. Default value: False.
+        """
+        super(YOLOXHead, self).__init__()
+
+        self.n_anchors = 1
+        self.num_classes = num_classes
+        self.decode_in_inference = True  # for deploy, set to False
+
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        self.cls_preds = nn.ModuleList()
+        self.reg_preds = nn.ModuleList()
+        self.obj_preds = nn.ModuleList()
+        self.stems = nn.ModuleList()
+        Conv = DWConv if depthwise else BaseConv
+
+        for i in range(len(in_channels)):
+            self.stems.append(
+                BaseConv(
+                    in_channels=int(in_channels[i] * width),
+                    out_channels=int(256 * width),
+                    ksize=1,
+                    stride=1,
+                    act=act,
+                ))
+            self.cls_convs.append(
+                nn.Sequential(*[
+                    Conv(
+                        in_channels=int(256 * width),
+                        out_channels=int(256 * width),
+                        ksize=3,
+                        stride=1,
+                        act=act,
+                    ),
+                    Conv(
+                        in_channels=int(256 * width),
+                        out_channels=int(256 * width),
+                        ksize=3,
+                        stride=1,
+                        act=act,
+                    ),
+                ]))
+            self.reg_convs.append(
+                nn.Sequential(*[
+                    Conv(
+                        in_channels=int(256 * width),
+                        out_channels=int(256 * width),
+                        ksize=3,
+                        stride=1,
+                        act=act,
+                    ),
+                    Conv(
+                        in_channels=int(256 * width),
+                        out_channels=int(256 * width),
+                        ksize=3,
+                        stride=1,
+                        act=act,
+                    ),
+                ]))
+            self.cls_preds.append(
+                nn.Conv2d(
+                    in_channels=int(256 * width),
+                    out_channels=self.n_anchors * self.num_classes,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                ))
+            self.reg_preds.append(
+                nn.Conv2d(
+                    in_channels=int(256 * width),
+                    out_channels=4,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                ))
+            self.obj_preds.append(
+                nn.Conv2d(
+                    in_channels=int(256 * width),
+                    out_channels=self.n_anchors * 1,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                ))
+
+        self.use_l1 = False
+        self.l1_loss = nn.L1Loss(reduction='none')
+        self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction='none')
+        # self.iou_loss = IOUloss(reduction="none")
+        self.strides = strides
+        self.grids = [torch.zeros(1)] * len(in_channels)
+
+    def initialize_biases(self, prior_prob):
+        for conv in self.cls_preds:
+            b = conv.bias.view(self.n_anchors, -1)
+            b.data.fill_(-math.log((1 - prior_prob) / prior_prob))
+            conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+        for conv in self.obj_preds:
+            b = conv.bias.view(self.n_anchors, -1)
+            b.data.fill_(-math.log((1 - prior_prob) / prior_prob))
+            conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+    def forward(self, xin, labels=None, imgs=None):
+        outputs = []
+
+        for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate(
+                zip(self.cls_convs, self.reg_convs, self.strides, xin)):
+            x = self.stems[k](x)
+            cls_x = x
+            reg_x = x
+
+            cls_feat = cls_conv(cls_x)
+            cls_output = self.cls_preds[k](cls_feat)
+
+            reg_feat = reg_conv(reg_x)
+            reg_output = self.reg_preds[k](reg_feat)
+            obj_output = self.obj_preds[k](reg_feat)
+
+            if self.training:
+                pass
+            else:
+                output = torch.cat(
+                    [reg_output,
+                     obj_output.sigmoid(),
+                     cls_output.sigmoid()], 1)
+
+            outputs.append(output)
+
+        if self.training:
+            pass
+        else:
+            self.hw = [x.shape[-2:] for x in outputs]
+            # [batch, n_anchors_all, 5 + num_classes] (85 when num_classes is 80)
+            outputs = torch.cat([x.flatten(start_dim=2) for x in outputs],
+                                dim=2).permute(0, 2, 1)
+            if self.decode_in_inference:
+                return self.decode_outputs(outputs, dtype=xin[0].type())
+            else:
+                return outputs
+
+    def decode_outputs(self, outputs, dtype):
+        grids = []
+        strides = []
+        for (hsize, wsize), stride in zip(self.hw, self.strides):
+            yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
+            grid = torch.stack((xv, yv), 2).view(1, -1, 2)
+            grids.append(grid)
+            shape = grid.shape[:2]
+            strides.append(torch.full((*shape, 1), stride))
+
+        grids = torch.cat(grids, dim=1).type(dtype)
+        strides = torch.cat(strides, dim=1).type(dtype)
+
+        outputs[..., :2] = (outputs[..., :2] + grids) * strides
+        outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
+        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py
new file mode 100644
index 00000000..cd4258bf
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py
@@ -0,0 +1,126 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch
+import torch.nn as nn
+
+from .darknet import CSPDarknet
+from .network_blocks import BaseConv, CSPLayer, DWConv
+
+
+class YOLOPAFPN(nn.Module):
+    """
+    YOLOPAFPN module. CSPDarknet is the backbone of this model.
+    """
+
+    def __init__(
+        self,
+        depth=1.0,
+        width=1.0,
+        in_features=('dark3', 'dark4', 'dark5'),
+        in_channels=[256, 512, 1024],
+        depthwise=False,
+        act='silu',
+    ):
+        super(YOLOPAFPN, self).__init__()
+        self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
+        self.in_features = in_features
+        self.in_channels = in_channels
+        Conv = DWConv if depthwise else BaseConv
+
+        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+        self.lateral_conv0 = BaseConv(
+            int(in_channels[2] * width),
+            int(in_channels[1] * width),
+            1,
+            1,
+            act=act)
+        self.C3_p4 = CSPLayer(
+            int(2 * in_channels[1] * width),
+            int(in_channels[1] * width),
+            round(3 * depth),
+            False,
+            depthwise=depthwise,
+            act=act,
+        )  # cat
+
+        self.reduce_conv1 = BaseConv(
+            int(in_channels[1] * width),
+            int(in_channels[0] * width),
+            1,
+            1,
+            act=act)
+        self.C3_p3 = CSPLayer(
+            int(2 * in_channels[0] * width),
+            int(in_channels[0] * width),
+            round(3 * depth),
+            False,
+            depthwise=depthwise,
+            act=act,
+        )
+
+        # bottom-up conv
+        self.bu_conv2 = Conv(
+            int(in_channels[0] * width),
+            int(in_channels[0] * width),
+            3,
+            2,
+            act=act)
+        self.C3_n3 = CSPLayer(
+            int(2 * in_channels[0] * width),
+            int(in_channels[1] * width),
+            round(3 * depth),
+            False,
+            depthwise=depthwise,
+            act=act,
+        )
+
+        # bottom-up conv
+        self.bu_conv1 = Conv(
+            int(in_channels[1] * width),
+            int(in_channels[1] * width),
+            3,
+            2,
+            act=act)
+        self.C3_n4 = CSPLayer(
+            int(2 * in_channels[1] * width),
+            int(in_channels[2] * width),
+            round(3 * depth),
+            False,
+            depthwise=depthwise,
+            act=act,
+        )
+
+    def forward(self, input):
+        """
+        Args:
+            input: input images.
+
+        Returns:
+            Tuple[Tensor]: FPN feature.
+        """
+
+        #  backbone
+        out_features = self.backbone(input)
+        features = [out_features[f] for f in self.in_features]
+        [x2, x1, x0] = features
+
+        fpn_out0 = self.lateral_conv0(x0)  # 1024->512/32
+        f_out0 = self.upsample(fpn_out0)  # 512/16
+        f_out0 = torch.cat([f_out0, x1], 1)  # 512->1024/16
+        f_out0 = self.C3_p4(f_out0)  # 1024->512/16
+
+        fpn_out1 = self.reduce_conv1(f_out0)  # 512->256/16
+        f_out1 = self.upsample(fpn_out1)  # 256/8
+        f_out1 = torch.cat([f_out1, x2], 1)  # 256->512/8
+        pan_out2 = self.C3_p3(f_out1)  # 512->256/8
+
+        p_out1 = self.bu_conv2(pan_out2)  # 256->256/16
+        p_out1 = torch.cat([p_out1, fpn_out1], 1)  # 256->512/16
+        pan_out1 = self.C3_n3(p_out1)  # 512->512/16
+
+        p_out0 = self.bu_conv1(pan_out1)  # 512->512/32
+        p_out0 = torch.cat([p_out0, fpn_out0], 1)  # 512->1024/32
+        pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32
+
+        outputs = (pan_out2, pan_out1, pan_out0)
+        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py
new file mode 100644
index 00000000..181c368b
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py
@@ -0,0 +1,33 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch.nn as nn
+
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+
+
+class YOLOX(nn.Module):
+    """
+    YOLOX model module, composed of a backbone (YOLOPAFPN by default) and a head
+    (YOLOXHead by default). Only inference is implemented: forward() raises
+    NotImplementedError in training mode and returns the head outputs otherwise.
+    """
+
+    def __init__(self, backbone=None, head=None):
+        super(YOLOX, self).__init__()
+        if backbone is None:
+            backbone = YOLOPAFPN()
+        if head is None:
+            head = YOLOXHead(80)
+
+        self.backbone = backbone
+        self.head = head
+
+    def forward(self, x, targets=None):
+        fpn_outs = self.backbone(x)
+        if self.training:
+            raise NotImplementedError('Training is not supported yet!')
+        else:
+            outputs = self.head(fpn_outs)
+
+        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py
new file mode 100644
index 00000000..2c1ea489
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .boxes import *  # noqa
+
+__all__ = ['bboxes_iou', 'meshgrid', 'postprocess', 'xyxy2cxcywh', 'xyxy2xywh']
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py b/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py
new file mode 100644
index 00000000..b29a3a04
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py
@@ -0,0 +1,107 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch
+import torchvision
+
+_TORCH_VER = [int(x) for x in torch.__version__.split('.')[:2]]
+
+
+def meshgrid(*tensors):
+    if _TORCH_VER >= [1, 10]:
+        return torch.meshgrid(*tensors, indexing='ij')
+    else:
+        return torch.meshgrid(*tensors)
+
+
+def xyxy2xywh(bboxes):
+    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+    bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+    return bboxes
+
+
+def xyxy2cxcywh(bboxes):
+    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+    bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+    bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5
+    bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5
+    return bboxes
+
+
+def postprocess(prediction,
+                num_classes,
+                conf_thre=0.7,
+                nms_thre=0.45,
+                class_agnostic=False):
+    box_corner = prediction.new(prediction.shape)
+    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
+    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
+    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
+    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
+    prediction[:, :, :4] = box_corner[:, :, :4]
+
+    output = [None for _ in range(len(prediction))]
+    for i, image_pred in enumerate(prediction):
+
+        # If none are remaining => process next image
+        if not image_pred.size(0):
+            continue
+        # Get score and class with highest confidence
+        class_conf, class_pred = torch.max(
+            image_pred[:, 5:5 + num_classes], 1, keepdim=True)
+
+        conf_mask = image_pred[:, 4] * class_conf.squeeze()
+        conf_mask = (conf_mask >= conf_thre).squeeze()
+        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
+        detections = torch.cat(
+            (image_pred[:, :5], class_conf, class_pred.float()), 1)
+        detections = detections[conf_mask]
+        if not detections.size(0):
+            continue
+
+        if class_agnostic:
+            nms_out_index = torchvision.ops.nms(
+                detections[:, :4],
+                detections[:, 4] * detections[:, 5],
+                nms_thre,
+            )
+        else:
+            nms_out_index = torchvision.ops.batched_nms(
+                detections[:, :4],
+                detections[:, 4] * detections[:, 5],
+                detections[:, 6],
+                nms_thre,
+            )
+
+        detections = detections[nms_out_index]
+        if output[i] is None:
+            output[i] = detections
+        else:
+            output[i] = torch.cat((output[i], detections))
+
+    return output
+
+
+def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
+    if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
+        raise IndexError
+
+    if xyxy:
+        tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
+        br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
+        area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
+        area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
+    else:
+        tl = torch.max(
+            (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),
+            (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2),
+        )
+        br = torch.min(
+            (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),
+            (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2),
+        )
+
+        area_a = torch.prod(bboxes_a[:, 2:], 1)
+        area_b = torch.prod(bboxes_b[:, 2:], 1)
+    en = (tl < br).type(tl.type()).prod(dim=2)
+    area_i = torch.prod(br - tl, 2) * en  # * ((tl < br).all())
+    return area_i / (area_a[:, None] + area_b - area_i)
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index b1a513e5..2c062226 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
     from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline
     from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline
     from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline
+    from .realtime_object_detection_pipeline import RealtimeObjectDetectionPipeline
     from .live_category_pipeline import LiveCategoryPipeline
     from .ocr_detection_pipeline import OCRDetectionPipeline
     from .ocr_recognition_pipeline import OCRRecognitionPipeline
@@ -75,6 +76,8 @@ else:
         ['Image2ImageTranslationPipeline'],
         'product_retrieval_embedding_pipeline':
         ['ProductRetrievalEmbeddingPipeline'],
+        'realtime_object_detection_pipeline':
+        ['RealtimeObjectDetectionPipeline'],
         'live_category_pipeline': ['LiveCategoryPipeline'],
         'image_to_image_generation_pipeline':
         ['Image2ImageGenerationPipeline'],
diff --git a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
new file mode 100644
index 00000000..629720d1
--- /dev/null
+++ b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
@@ -0,0 +1,50 @@
+import os.path as osp
+from typing import Any, Dict, List, Union
+
+import cv2
+import json
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.realtime_object_detection import RealtimeDetector
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Model, Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import load_image
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_object_detection,
+    module_name=Pipelines.realtime_object_detection)
+class RealtimeObjectDetectionPipeline(Pipeline):
+    """Detection pipeline that delegates every stage to ``RealtimeDetector``."""
+
+    def __init__(self, model: str, **kwargs):
+        super().__init__(model=model, **kwargs)
+        self.model = RealtimeDetector(model)
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Run the detector's own preprocessing on the raw input."""
+        return {'pre_output': self.model.preprocess(input)}
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Call the detector on the value stored under 'pre_output'."""
+        return {'forward_output': self.model(input['pre_output'])}
+
+    def postprocess(self, input: Dict[str, Any],
+                    **kwargs) -> Dict[str, Any]:
+        """Unpack the (bboxes, scores, labels) triple into the output dict."""
+        bboxes, scores, labels = input['forward_output']
+        return {
+            OutputKeys.BOXES: bboxes,
+            OutputKeys.SCORES: scores,
+            OutputKeys.LABELS: labels,
+        }
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index 0ad0ef8f..ea1d95b5 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -70,6 +70,13 @@ def draw_box(image, box):
                   (int(box[1][0]), int(box[1][1])), (0, 0, 255), 2)
 
 
+def realtime_object_detection_bbox_vis(image, bboxes):
+    """Draw every (x1, y1, x2, y2) box on ``image`` in place and return it."""
+    for x1, y1, x2, y2 in ([int(v) for v in bbox[:4]] for bbox in bboxes):
+        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
+    return image
+
+
 def draw_keypoints(output, original_image):
     poses = np.array(output[OutputKeys.POSES])
     scores = np.array(output[OutputKeys.SCORES])
diff --git a/tests/pipelines/test_realtime_object_detection.py b/tests/pipelines/test_realtime_object_detection.py
new file mode 100644
index 00000000..03ddacf4
--- /dev/null
+++ b/tests/pipelines/test_realtime_object_detection.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import cv2
+import numpy as np
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import realtime_object_detection_bbox_vis
+from modelscope.utils.test_utils import test_level
+
+
+class RealtimeObjectDetectionTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_cspnet_image-object-detection_yolox'
+        self.model_nano_id = 'damo/cv_cspnet_image-object-detection_yolox_nano_coco'
+        self.test_image = 'data/test/images/keypoints_detect/000000438862.jpg'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        realtime_object_detection = pipeline(
+            Tasks.image_object_detection, model=self.model_id)
+
+        image = cv2.imread(self.test_image)
+        result = realtime_object_detection(image)
+        if result:
+            bboxes = result[OutputKeys.BOXES].astype(int)
+            image = realtime_object_detection_bbox_vis(image, bboxes)
+            cv2.imwrite('rt_obj_out.jpg', image)
+        else:
+            raise ValueError('process error')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_nano(self):
+        realtime_object_detection = pipeline(
+            Tasks.image_object_detection, model=self.model_nano_id)
+
+        image = cv2.imread(self.test_image)
+        result = realtime_object_detection(image)
+        if result:
+            bboxes = result[OutputKeys.BOXES].astype(int)
+            image = realtime_object_detection_bbox_vis(image, bboxes)
+            cv2.imwrite('rtnano_obj_out.jpg', image)
+        else:
+            raise ValueError('process error')
+
+
+if __name__ == '__main__':
+    unittest.main()

From 00b448a2feb2982973c6716ba793d81fb5f1ef59 Mon Sep 17 00:00:00 2001
From: "hanyuan.chy" <hanyuan.chy@alibaba-inc.com>
Date: Sat, 27 Aug 2022 14:11:30 +0800
Subject: [PATCH 017/175] [to #42322933] support 3d body keypoints        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9862567

---
 data/test/videos/Walking.54138969.mp4         |   3 +
 modelscope/metainfo.py                        |   2 +
 modelscope/models/cv/__init__.py              |  11 +-
 .../models/cv/body_3d_keypoints/__init__.py   |  23 ++
 .../cv/body_3d_keypoints/body_3d_pose.py      | 246 ++++++++++++++++++
 .../canonical_pose_modules.py                 | 233 +++++++++++++++++
 modelscope/outputs.py                         |  10 +
 modelscope/pipelines/builder.py               |   2 +
 modelscope/pipelines/cv/__init__.py           |   2 +
 .../cv/body_3d_keypoints_pipeline.py          | 213 +++++++++++++++
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_body_3d_keypoints.py     |  49 ++++
 12 files changed, 792 insertions(+), 3 deletions(-)
 create mode 100644 data/test/videos/Walking.54138969.mp4
 create mode 100644 modelscope/models/cv/body_3d_keypoints/__init__.py
 create mode 100644 modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
 create mode 100644 modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
 create mode 100644 modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
 create mode 100644 tests/pipelines/test_body_3d_keypoints.py

diff --git a/data/test/videos/Walking.54138969.mp4 b/data/test/videos/Walking.54138969.mp4
new file mode 100644
index 00000000..1716695f
--- /dev/null
+++ b/data/test/videos/Walking.54138969.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b8f50a0537bfe7e082c5ad91b2b7ece61a0adbeb7489988e553909276bf920c
+size 44217644
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 153ca9b4..d9e53ca7 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -20,6 +20,7 @@ class Models(object):
     gpen = 'gpen'
     product_retrieval_embedding = 'product-retrieval-embedding'
     body_2d_keypoints = 'body-2d-keypoints'
+    body_3d_keypoints = 'body-3d-keypoints'
     crowd_counting = 'HRNetCrowdCounting'
     panoptic_segmentation = 'swinL-panoptic-segmentation'
     image_reid_person = 'passvitb'
@@ -95,6 +96,7 @@ class Pipelines(object):
     general_recognition = 'resnet101-general-recognition'
     cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding'
     body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
+    body_3d_keypoints = 'canonical_body-3d-keypoints_video'
     human_detection = 'resnet18-human-detection'
     object_detection = 'vit-object-detection'
     easycv_detection = 'easycv-detection'
diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py
index 74451c31..10040637 100644
--- a/modelscope/models/cv/__init__.py
+++ b/modelscope/models/cv/__init__.py
@@ -1,11 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+
+# yapf: disable
 from . import (action_recognition, animal_recognition, body_2d_keypoints,
-               cartoon, cmdssl_video_embedding, crowd_counting, face_detection,
-               face_generation, image_classification, image_color_enhance,
-               image_colorization, image_denoise, image_instance_segmentation,
+               body_3d_keypoints, cartoon, cmdssl_video_embedding,
+               crowd_counting, face_detection, face_generation,
+               image_classification, image_color_enhance, image_colorization,
+               image_denoise, image_instance_segmentation,
                image_panoptic_segmentation, image_portrait_enhancement,
                image_reid_person, image_semantic_segmentation,
                image_to_image_generation, image_to_image_translation,
                object_detection, product_retrieval_embedding,
                realtime_object_detection, salient_detection, super_resolution,
                video_single_object_tracking, video_summarization, virual_tryon)
+
+# yapf: enable
diff --git a/modelscope/models/cv/body_3d_keypoints/__init__.py b/modelscope/models/cv/body_3d_keypoints/__init__.py
new file mode 100644
index 00000000..4bb83936
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .body_3d_pose import BodyKeypointsDetection3D
+
+else:
+    _import_structure = {
+        'body_3d_pose': ['BodyKeypointsDetection3D'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
new file mode 100644
index 00000000..87cd4962
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
@@ -0,0 +1,246 @@
+import logging
+import os.path as osp
+from typing import Any, Dict, List, Union
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.body_3d_keypoints.canonical_pose_modules import (
+    TemporalModel, TransCan3Dkeys)
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['BodyKeypointsDetection3D']
+
+
+class KeypointsTypes(object):
+    POSES_CAMERA = 'poses_camera'
+    POSES_TRAJ = 'poses_traj'
+
+
+@MODELS.register_module(
+    Tasks.body_3d_keypoints, module_name=Models.body_3d_keypoints)
+class BodyKeypointsDetection3D(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+
+        super().__init__(model_dir, *args, **kwargs)
+
+        self.model_dir = model_dir
+        model_path = osp.join(self.model_dir, ModelFile.TORCH_MODEL_FILE)
+        cfg_path = osp.join(self.model_dir, ModelFile.CONFIGURATION)
+        self.cfg = Config.from_file(cfg_path)
+        self._create_model()
+
+        if not osp.exists(model_path):
+            raise IOError(f'{model_path} is not exists.')
+
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+        self.pretrained_state_dict = torch.load(
+            model_path, map_location=self._device)
+
+        self.load_pretrained()
+        self.to_device(self._device)
+        self.eval()
+
+    def _create_model(self):
+        self.model_pos = TemporalModel(
+            self.cfg.model.MODEL.IN_NUM_JOINTS,
+            self.cfg.model.MODEL.IN_2D_FEATURE,
+            self.cfg.model.MODEL.OUT_NUM_JOINTS,
+            filter_widths=self.cfg.model.MODEL.FILTER_WIDTHS,
+            causal=self.cfg.model.MODEL.CAUSAL,
+            dropout=self.cfg.model.MODEL.DROPOUT,
+            channels=self.cfg.model.MODEL.CHANNELS,
+            dense=self.cfg.model.MODEL.DENSE)
+
+        receptive_field = self.model_pos.receptive_field()
+        self.pad = (receptive_field - 1) // 2
+        if self.cfg.model.MODEL.CAUSAL:
+            self.causal_shift = self.pad
+        else:
+            self.causal_shift = 0
+
+        self.model_traj = TransCan3Dkeys(
+            in_channels=self.cfg.model.MODEL.IN_NUM_JOINTS
+            * self.cfg.model.MODEL.IN_2D_FEATURE,
+            num_features=1024,
+            out_channels=self.cfg.model.MODEL.OUT_3D_FEATURE,
+            num_blocks=4,
+            time_window=receptive_field)
+
+    def eval(self):
+        self.model_pos.eval()
+        self.model_traj.eval()
+
+    def train(self):
+        self.model_pos.train()
+        self.model_traj.train()
+
+    def to_device(self, device):
+        self.model_pos = self.model_pos.to(device)
+        self.model_traj = self.model_traj.to(device)
+
+    def load_pretrained(self):
+        """Copy checkpoint weights into ``model_pos`` and ``model_traj``.
+
+        Reads ``self.pretrained_state_dict``; a missing entry is logged as an
+        error and nothing is loaded into that sub-model.
+        """
+        for key, tag in (('model_pos', 'pos'), ('model_traj', 'traj')):
+            if key in self.pretrained_state_dict:
+                # strict=False: mismatched parameter names do not raise.
+                getattr(self, key).load_state_dict(
+                    self.pretrained_state_dict[key], strict=False)
+            else:
+                # Use the module-level logger rather than the root logger.
+                logger.error(
+                    f'Not load model {tag} from pretrained_state_dict, '
+                    'not in pretrained_state_dict')
+        logger.info('Load pretrained model done.')
+
+    def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Preprocess the 2D input joints.
+
+        Args:
+            input: [NUM_FRAME, NUM_JOINTS, 2] 2d human body keypoints. The body
+                reads ``input.device``, so presumably a tensor, not a dict.
+        Returns:
+            Dict[str, Any]: canonical 2d points and root relative joints.
+        """
+        if 'cuda' == input.device.type:
+            input = input.data.cpu().numpy()
+        elif 'cpu' == input.device.type:
+            input = input.data.numpy()
+        pose2d = input
+
+        pose2d_canonical = self.canonicalize_2Ds(
+            pose2d, self.cfg.model.INPUT.FOCAL_LENGTH,
+            self.cfg.model.INPUT.CENTER)
+        pose2d_normalized = self.normalize_screen_coordinates(
+            pose2d, self.cfg.model.INPUT.RES_W, self.cfg.model.INPUT.RES_H)
+        pose2d_rr = pose2d_normalized
+        pose2d_rr[:, 1:] -= pose2d_rr[:, :1]
+
+        # edge-pad the frame axis, then add a leading axis of size 1
+        pose2d_rr = np.expand_dims(
+            np.pad(
+                pose2d_rr,
+                ((self.pad + self.causal_shift, self.pad - self.causal_shift),
+                 (0, 0), (0, 0)), 'edge'),
+            axis=0)
+        pose2d_canonical = np.expand_dims(
+            np.pad(
+                pose2d_canonical,
+                ((self.pad + self.causal_shift, self.pad - self.causal_shift),
+                 (0, 0), (0, 0)), 'edge'),
+            axis=0)
+        pose2d_rr = torch.from_numpy(pose2d_rr.astype(np.float32))
+        pose2d_canonical = torch.from_numpy(
+            pose2d_canonical.astype(np.float32))
+
+        inputs_2d = pose2d_rr.clone()
+        if torch.cuda.is_available():
+            inputs_2d = inputs_2d.cuda(non_blocking=True)
+
+        # Positional model: zero joint 0 for offsets, else add it back to the rest
+        if self.cfg.model.MODEL.USE_2D_OFFSETS:
+            inputs_2d[:, :, 0] = 0
+        else:
+            inputs_2d[:, :, 1:] += inputs_2d[:, :, :1]
+
+        return {
+            'inputs_2d': inputs_2d,
+            'pose2d_rr': pose2d_rr,
+            'pose2d_canonical': pose2d_canonical
+        }
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """3D human pose estimation.
+
+        Args:
+            input (Dict):
+                inputs_2d:  [1, NUM_FRAME, NUM_JOINTS, 2]
+                pose2d_rr:  [1, NUM_FRAME, NUM_JOINTS, 2]
+                pose2d_canonical: [1, NUM_FRAME, NUM_JOINTS, 2]
+                NUM_FRAME = max(receptive_field + video_frame_number, video_frame_number)
+
+        Returns:
+            Dict[str, Any]:
+                "poses_camera": Tensor, [1, NUM_FRAME, OUT_NUM_JOINTS, OUT_3D_FEATURE_DIM],
+                    3D human pose keypoints in camera frame.
+                "poses_traj": Tensor, [1, NUM_FRAME, 1, 3],
+                    root keypoints coordinates in camera frame.
+        """
+        inputs_2d = input['inputs_2d']
+        pose2d_rr = input['pose2d_rr']
+        pose2d_canonical = input['pose2d_canonical']
+        with torch.no_grad():
+            # predict 3D pose keypoints
+            predicted_3d_pos = self.model_pos(inputs_2d)
+
+            # predict global trajectory
+            b1, w1, n1, d1 = inputs_2d.shape
+
+            input_pose2d_abs = self.get_abs_2d_pts(w1, pose2d_rr,
+                                                   pose2d_canonical)
+            b1, w1, n1, d1 = input_pose2d_abs.size()
+            b2, w2, n2, d2 = predicted_3d_pos.size()
+
+            if torch.cuda.is_available():
+                input_pose2d_abs = input_pose2d_abs.cuda(non_blocking=True)
+
+            predicted_3d_traj = self.model_traj(
+                input_pose2d_abs.view(b1, w1, n1 * d1),
+                predicted_3d_pos.view(b2 * w2, n2 * d2)).view(b2, w2, -1, 3)
+
+            predict_dict = {
+                KeypointsTypes.POSES_CAMERA: predicted_3d_pos,
+                KeypointsTypes.POSES_TRAJ: predicted_3d_traj
+            }
+
+        return predict_dict
+
+    def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr,
+                       pose2d_canonical):
+        """Build per-frame sliding windows of absolute 2D keypoints.
+
+        One window of ``2 * self.pad + 1`` frames is cut per centre frame and
+        the windows are concatenated along dim 0. Returns the canonical
+        windows if ``USE_CANONICAL_COORDS``, else the root-relative windows
+        with joint 0 added back onto the other joints.
+        """
+        pad = self.pad
+        w = input_video_frame_num - pad * 2
+        # Time slices [i - pad, i + pad] for each centre frame i.
+        wins = [slice(i - pad, i + pad + 1) for i in range(pad, w + pad)]
+
+        if self.cfg.model.MODEL.USE_CANONICAL_COORDS:
+            return torch.concat([pose2d_canonical[:, s] for s in wins], axis=0)
+
+        # Bug fix: this branch used to be fed the canonical windows as well.
+        input_pose2d_abs = torch.concat([pose2d_rr[:, s] for s in wins],
+                                        axis=0)
+        input_pose2d_abs[:, :, 1:] += input_pose2d_abs[:, :, :1]
+        return input_pose2d_abs
+
+    def canonicalize_2Ds(self, pos2d, f, c):
+        cs = np.array([c[0], c[1]]).reshape(1, 1, 2)
+        fs = np.array([f[0], f[1]]).reshape(1, 1, 2)
+        canoical_2Ds = (pos2d - cs) / fs
+        return canoical_2Ds
+
+    def normalize_screen_coordinates(self, X, w, h):
+        assert X.shape[-1] == 2
+
+        # Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio
+        return X / w * 2 - [1, h / w]
diff --git a/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
new file mode 100644
index 00000000..b3eac2e5
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
@@ -0,0 +1,233 @@
+# The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D
+import torch
+import torch.nn as nn
+
+
+class TemporalModelBase(nn.Module):
+    """
+    Do not instantiate this class.
+    """
+
+    def __init__(self, num_joints_in, in_features, num_joints_out,
+                 filter_widths, causal, dropout, channels):
+        super().__init__()
+
+        # Validate input
+        for fw in filter_widths:
+            assert fw % 2 != 0, 'Only odd filter widths are supported'
+
+        self.num_joints_in = num_joints_in
+        self.in_features = in_features
+        self.num_joints_out = num_joints_out
+        self.filter_widths = filter_widths
+
+        self.drop = nn.Dropout(dropout)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.pad = [filter_widths[0] // 2]
+        self.expand_bn = nn.BatchNorm1d(channels, momentum=0.1)
+        self.shrink = nn.Conv1d(channels, num_joints_out * 3, 1)
+
+    def set_bn_momentum(self, momentum):
+        self.expand_bn.momentum = momentum
+        for bn in self.layers_bn:
+            bn.momentum = momentum
+
+    def receptive_field(self):
+        """
+        Return the total receptive field of this model as # of frames.
+        """
+        frames = 0
+        for f in self.pad:
+            frames += f
+        return 1 + 2 * frames
+
+    def total_causal_shift(self):
+        """
+        Return the asymmetric offset for sequence padding.
+        The returned value is typically 0 if causal convolutions are disabled,
+        otherwise it is half the receptive field.
+        """
+        frames = self.causal_shift[0]
+        next_dilation = self.filter_widths[0]
+        for i in range(1, len(self.filter_widths)):
+            frames += self.causal_shift[i] * next_dilation
+            next_dilation *= self.filter_widths[i]
+        return frames
+
+    def forward(self, x):
+        assert len(x.shape) == 4
+        assert x.shape[-2] == self.num_joints_in
+        assert x.shape[-1] == self.in_features
+
+        sz = x.shape[:3]
+        x = x.view(x.shape[0], x.shape[1], -1)
+        x = x.permute(0, 2, 1)
+
+        x = self._forward_blocks(x)
+
+        x = x.permute(0, 2, 1)
+        x = x.view(sz[0], -1, self.num_joints_out, 3)
+
+        return x
+
+
+class TemporalModel(TemporalModelBase):
+    """
+    Reference 3D pose estimation model with temporal convolutions.
+    This implementation can be used for all use-cases.
+    """
+
+    def __init__(self,
+                 num_joints_in,
+                 in_features,
+                 num_joints_out,
+                 filter_widths,
+                 causal=False,
+                 dropout=0.25,
+                 channels=1024,
+                 dense=False):
+        """
+        Initialize this model.
+
+        Arguments:
+        num_joints_in -- number of input joints (e.g. 17 for Human3.6M)
+        in_features -- number of input features for each joint (typically 2 for 2D input)
+        num_joints_out -- number of output joints (can be different than input)
+        filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field
+        causal -- use causal convolutions instead of symmetric convolutions (for real-time applications)
+        dropout -- dropout probability
+        channels -- number of convolution channels
+        dense -- use regular dense convolutions instead of dilated convolutions (ablation experiment)
+        """
+        super().__init__(num_joints_in, in_features, num_joints_out,
+                         filter_widths, causal, dropout, channels)
+
+        self.expand_conv = nn.Conv1d(
+            num_joints_in * in_features,
+            channels,
+            filter_widths[0],
+            bias=False)
+
+        layers_conv = []
+        layers_bn = []
+
+        self.causal_shift = [(filter_widths[0]) // 2 if causal else 0]
+        next_dilation = filter_widths[0]
+        for i in range(1, len(filter_widths)):
+            self.pad.append((filter_widths[i] - 1) * next_dilation // 2)
+            self.causal_shift.append((filter_widths[i] // 2
+                                      * next_dilation) if causal else 0)
+
+            layers_conv.append(
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    filter_widths[i] if not dense else (2 * self.pad[-1] + 1),
+                    dilation=next_dilation if not dense else 1,
+                    bias=False))
+            layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1))
+            layers_conv.append(
+                nn.Conv1d(channels, channels, 1, dilation=1, bias=False))
+            layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1))
+
+            next_dilation *= filter_widths[i]
+
+        self.layers_conv = nn.ModuleList(layers_conv)
+        self.layers_bn = nn.ModuleList(layers_bn)
+
+    def _forward_blocks(self, x):
+        x = self.drop(self.relu(self.expand_bn(self.expand_conv(x))))
+        for i in range(len(self.pad) - 1):
+            pad = self.pad[i + 1]
+            shift = self.causal_shift[i + 1]
+            res = x[:, :, pad + shift:x.shape[2] - pad + shift]
+            x = self.drop(
+                self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x))))
+            x = res + self.drop(
+                self.relu(self.layers_bn[2 * i + 1](
+                    self.layers_conv[2 * i + 1](x))))
+
+        x = self.shrink(x)
+        return x
+
+
+# regression of the trajectory
+class TransCan3Dkeys(nn.Module):
+
+    def __init__(self,
+                 in_channels=74,
+                 num_features=256,
+                 out_channels=44,
+                 time_window=10,
+                 num_blocks=2):
+        super().__init__()
+        self.in_channels = in_channels
+        self.num_features = num_features
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.time_window = time_window
+
+        self.expand_bn = nn.BatchNorm1d(self.num_features, momentum=0.1)
+        self.conv1 = nn.Sequential(
+            nn.ReplicationPad1d(1),
+            nn.Conv1d(
+                self.in_channels, self.num_features, kernel_size=3,
+                bias=False), self.expand_bn, nn.ReLU(inplace=True),
+            nn.Dropout(p=0.25))
+        self._make_blocks()
+        self.pad = nn.ReplicationPad1d(4)
+        self.relu = nn.ReLU(inplace=True)
+        self.drop = nn.Dropout(p=0.25)
+        self.reduce = nn.Conv1d(
+            self.num_features, self.num_features, kernel_size=self.time_window)
+        self.embedding_3d_1 = nn.Linear(in_channels // 2 * 3, 500)
+        self.embedding_3d_2 = nn.Linear(500, 500)
+        self.LReLU1 = nn.LeakyReLU()
+        self.LReLU2 = nn.LeakyReLU()
+        self.LReLU3 = nn.LeakyReLU()
+        self.out1 = nn.Linear(self.num_features + 500, self.num_features)
+        self.out2 = nn.Linear(self.num_features, self.out_channels)
+
+    def _make_blocks(self):
+        layers_conv = []
+        layers_bn = []
+        for i in range(self.num_blocks):
+            layers_conv.append(
+                nn.Conv1d(
+                    self.num_features,
+                    self.num_features,
+                    kernel_size=5,
+                    bias=False,
+                    dilation=2))
+            layers_bn.append(nn.BatchNorm1d(self.num_features))
+        self.layers_conv = nn.ModuleList(layers_conv)
+        self.layers_bn = nn.ModuleList(layers_bn)
+
+    def set_bn_momentum(self, momentum):
+        self.expand_bn.momentum = momentum
+        for bn in self.layers_bn:
+            bn.momentum = momentum
+
+    def forward(self, p2ds, p3d):
+        """Regress the trajectory from a 2D keypoint window and a 3D pose.
+        p2ds - (B x T x C), permuted to (B x C x T) for the 1D convolutions
+        p3d - 2D, last dim in_channels // 2 * 3 (input of embedding_3d_1)
+        """
+        B, T, C = p2ds.shape
+        x = p2ds.permute((0, 2, 1))
+        x = self.conv1(x)
+        for i in range(self.num_blocks):
+            pre = x
+            x = self.pad(x)
+            x = self.layers_conv[i](x)
+            x = self.layers_bn[i](x)
+            x = self.drop(self.relu(x))
+            x = pre + x
+        x_2d = self.relu(self.reduce(x))
+        x_2d = x_2d.view(B, -1)
+        x_3d = self.LReLU1(self.embedding_3d_1(p3d))
+        x = torch.cat((x_2d, x_3d), 1)
+        x = self.LReLU3(self.out1(x))
+        x = self.out2(x)
+        return x
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 2edd76a2..622d9034 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -189,6 +189,16 @@ TASK_OUTPUTS = {
     Tasks.body_2d_keypoints:
     [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES],
 
+    # 3D human body keypoints detection result for single sample
+    # {
+    #   "poses": [
+    #               [[x, y, z]*17],
+    #               [[x, y, z]*17],
+    #               [[x, y, z]*17]
+    #             ]
+    # }
+    Tasks.body_3d_keypoints: [OutputKeys.POSES],
+
     # video single object tracking result for single video
     # {
     #   "boxes": [
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index fa6705a7..f8f679e6 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -89,6 +89,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/cv_diffusion_text-to-image-synthesis_tiny'),
     Tasks.body_2d_keypoints: (Pipelines.body_2d_keypoints,
                               'damo/cv_hrnetv2w32_body-2d-keypoints_image'),
+    Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints,
+                              'damo/cv_canonical_body-3d-keypoints_video'),
     Tasks.face_detection: (Pipelines.face_detection,
                            'damo/cv_resnet_facedetection_scrfd10gkps'),
     Tasks.face_recognition: (Pipelines.face_recognition,
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 2c062226..640ffd4c 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -7,6 +7,7 @@ if TYPE_CHECKING:
     from .action_recognition_pipeline import ActionRecognitionPipeline
     from .animal_recognition_pipeline import AnimalRecognitionPipeline
     from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline
+    from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline
     from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline
     from .crowd_counting_pipeline import CrowdCountingPipeline
     from .image_detection_pipeline import ImageDetectionPipeline
@@ -46,6 +47,7 @@ else:
         'action_recognition_pipeline': ['ActionRecognitionPipeline'],
         'animal_recognition_pipeline': ['AnimalRecognitionPipeline'],
         'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'],
+        'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'],
         'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'],
         'crowd_counting_pipeline': ['CrowdCountingPipeline'],
         'image_detection_pipeline': ['ImageDetectionPipeline'],
diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
new file mode 100644
index 00000000..e9e4e9e8
--- /dev/null
+++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
@@ -0,0 +1,213 @@
+import os
+import os.path as osp
+from typing import Any, Dict, List, Union
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.body_3d_keypoints.body_3d_pose import (
+    BodyKeypointsDetection3D, KeypointsTypes)
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Model, Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def convert_2_h36m(joints, joints_nbr=15):
+    """Remap 15 joints to the h36m index order; 17 adds two midpoints."""
+    lst_mappings = [[0, 8], [1, 7], [2, 12], [3, 13], [4, 14], [5, 9], [6, 10],
+                    [7, 11], [8, 1], [9, 2], [10, 3], [11, 4], [12, 5],
+                    [13, 6], [14, 0]]
+    nbr, dim = joints.shape
+    h36m_joints = np.zeros((nbr, dim))
+    for src_idx, dst_idx in lst_mappings:
+        h36m_joints[dst_idx] = joints[src_idx]
+    if joints_nbr == 17:
+        lst_mappings_17 = np.array([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4],
+                                    [5, 5], [6, 6], [7, 8], [8, 10], [9, 11],
+                                    [10, 12], [11, 13], [12, 14], [13, 15],
+                                    [14, 16]])
+        h36m_joints_17 = np.zeros((17, dim))  # was hard-coded to 2 columns
+        src_cols, dst_cols = lst_mappings_17[:, 0], lst_mappings_17[:, 1]
+        h36m_joints_17[dst_cols] = h36m_joints[src_cols]
+        h36m_joints_17[7] = (h36m_joints_17[0] + h36m_joints_17[8]) * 0.5
+        h36m_joints_17[9] = (h36m_joints_17[8] + h36m_joints_17[10]) * 0.5
+        h36m_joints = h36m_joints_17
+
+    return h36m_joints
+
+
+def smooth_pts(cur_pts, pre_pts, bbox, smooth_x=15.0, smooth_y=15.0):
+    """Blend cur_pts toward pre_pts in place; small moves are damped most."""
+    if pre_pts is None:
+        return cur_pts
+
+    w, h = bbox[1] - bbox[0]  # extents between the two bbox rows
+    if w == 0 or h == 0:
+        return cur_pts
+    if len(pre_pts) == 0 or len(cur_pts) == 0:
+        return cur_pts
+
+    # Decay rates are normalised by the box extent along the matching axis
+    # (the y rate previously divided by the width, leaving h unused).
+    factor_x = -(smooth_x / w)
+    factor_y = -(smooth_y / h)
+
+    for i in range(len(cur_pts)):
+        w_x = np.exp(factor_x * np.abs(cur_pts[i][0] - pre_pts[i][0]))
+        w_y = np.exp(factor_y * np.abs(cur_pts[i][1] - pre_pts[i][1]))
+        cur_pts[i][0] = (1.0 - w_x) * cur_pts[i][0] + w_x * pre_pts[i][0]
+        cur_pts[i][1] = (1.0 - w_y) * cur_pts[i][1] + w_y * pre_pts[i][1]
+    return cur_pts
+
+
+def smoothing(lst_kps, lst_bboxes, smooth_x=15.0, smooth_y=15.0):
+    """Smooth each frame's keypoints against the previous smoothed frame."""
+    assert lst_kps.shape[0] == lst_bboxes.shape[0]
+
+    smoothed_frames = []
+    last_pts = None  # the first frame has nothing to blend with
+    for frame_kps, frame_box in zip(lst_kps, lst_bboxes):
+        # drop the trailing element and view the rest as a 2x2 box
+        corners = frame_box[0:-1].reshape(2, 2)
+        last_pts = smooth_pts(frame_kps, last_pts, corners, smooth_x,
+                              smooth_y)
+        smoothed_frames.append(last_pts)
+    return np.array(smoothed_frames)
+
+
+def convert_2_h36m_data(lst_kps, lst_bboxes, joints_nbr=15):
+    """Smooth per-frame 2D keypoints and remap them to the h36m layout."""
+    # Flatten leading axes without squeeze(), which would also drop the
+    # frame axis of a single-frame clip and trip the assertion below.
+    lst_kps = lst_kps.reshape((-1, ) + lst_kps.shape[-2:])
+    lst_bboxes = lst_bboxes.reshape((-1, lst_bboxes.shape[-1]))
+    assert lst_kps.shape[0] == lst_bboxes.shape[0]
+
+    lst_kps = smoothing(lst_kps, lst_bboxes)
+    return [
+        convert_2_h36m(frame_kps, joints_nbr=joints_nbr)
+        for frame_kps in lst_kps
+    ]
+
+
+@PIPELINES.register_module(
+    Tasks.body_3d_keypoints, module_name=Pipelines.body_3d_keypoints)
+class Body3DKeypointsPipeline(Pipeline):
+
+    def __init__(self, model: Union[str, BodyKeypointsDetection3D], **kwargs):
+        """Human body 3D pose estimation.
+
+        Args:
+            model (Union[str, BodyKeypointsDetection3D]): hub model id or model instance.
+        """
+        super().__init__(model=model, **kwargs)
+
+        self.keypoint_model_3d = model if isinstance(
+            model, BodyKeypointsDetection3D) else Model.from_pretrained(model)
+        self.keypoint_model_3d.eval()
+
+        # init human body 2D keypoints detection pipeline
+        self.human_body_2d_kps_det_pipeline = 'damo/cv_hrnetv2w32_body-2d-keypoints_image'
+        self.human_body_2d_kps_detector = pipeline(
+            Tasks.body_2d_keypoints,
+            model=self.human_body_2d_kps_det_pipeline,
+            device='gpu' if torch.cuda.is_available() else 'cpu')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Detect 2D keypoints per frame and convert them to model input."""
+        video_frames = self.read_video_frames(input)
+        if not video_frames:
+            return {'success': False, 'msg': 'get video frame failed.'}
+
+        all_2d_poses = []
+        all_boxes_with_score = []
+        max_frame = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME  # max video frame number to be predicted 3D joints
+        for i, frame in enumerate(video_frames):
+            kps_2d = self.human_body_2d_kps_detector(frame)
+            box = kps_2d['boxes'][
+                0]  # box: [[[x1, y1], [x2, y2]]], N human boxes per frame, [0] represent using first detected bbox
+            pose = kps_2d['poses'][0]  # keypoints: [15, 2]
+            score = kps_2d['scores'][0]  # appended as the 5th box value below
+            all_2d_poses.append(pose)
+            all_boxes_with_score.append(
+                list(np.array(box).reshape(
+                    (-1))) + [score])  # construct to list with shape [5]
+            if (i + 1) >= max_frame:
+                break
+
+        all_2d_poses_np = np.array(all_2d_poses).reshape(
+            (len(all_2d_poses), 15,
+             2))  # 15: 2d keypoints number, 2: keypoint coordinate (x, y)
+        all_boxes_np = np.array(all_boxes_with_score).reshape(
+            (len(all_boxes_with_score), 5))  # [x1, y1, x2, y2, score]
+
+        kps_2d_h36m_17 = convert_2_h36m_data(
+            all_2d_poses_np,
+            all_boxes_np,
+            joints_nbr=self.keypoint_model_3d.cfg.model.MODEL.IN_NUM_JOINTS)
+        kps_2d_h36m_17 = np.array(kps_2d_h36m_17)
+        res = {'success': True, 'input_2d_pts': kps_2d_h36m_17}
+        return res
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run the 3D keypoints model on the preprocessed 2D keypoints."""
+        if not input['success']:
+            return {'success': False, 'msg': 'preprocess failed.'}
+
+        input_2d_pts = input['input_2d_pts']
+        outputs = self.keypoint_model_3d.preprocess(input_2d_pts)
+        outputs = self.keypoint_model_3d.forward(outputs)
+        res = dict({'success': True}, **outputs)
+        return res
+
+    def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        """Convert the model output into the pipeline result.
+
+        Returns an empty pose list when an earlier stage reported failure.
+        """
+        if not input['success']:
+            return {OutputKeys.POSES: []}
+        poses = input[KeypointsTypes.POSES_CAMERA]
+        return {OutputKeys.POSES: poses.data.cpu().numpy()}
+
+    def read_video_frames(self, video_url: Union[str, cv2.VideoCapture]):
+        """Read video from local video file or from a video stream URL.
+
+        Args:
+            video_url (str or cv2.VideoCapture): Video path or video stream.
+
+        Raises:
+            Exception: The given path cannot be opened by OpenCV.
+
+        Returns:
+            [nd.array]: Up to MAX_FRAME frames; the capture is released.
+        """
+        frames = []
+        if isinstance(video_url, str):
+            cap = cv2.VideoCapture(video_url)
+            if not cap.isOpened():
+                raise Exception(
+                    'modelscope error: %s cannot be decoded by OpenCV.' %
+                    (video_url))
+        else:
+            cap = video_url
+
+        max_frame_num = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME
+        try:
+            while True:
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                frames.append(frame)
+                if len(frames) >= max_frame_num:
+                    break
+        finally:
+            cap.release()  # also runs if decoding raises mid-stream
+        return frames
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 52c08594..2141a012 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -24,6 +24,7 @@ class CVTasks(object):
     human_object_interaction = 'human-object-interaction'
     face_image_generation = 'face-image-generation'
     body_2d_keypoints = 'body-2d-keypoints'
+    body_3d_keypoints = 'body-3d-keypoints'
     general_recognition = 'general-recognition'
 
     image_classification = 'image-classification'
diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py
new file mode 100644
index 00000000..50426414
--- /dev/null
+++ b/tests/pipelines/test_body_3d_keypoints.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import pdb
+import unittest
+
+import cv2
+import numpy as np
+from PIL import Image
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class Body3DKeypointsTest(unittest.TestCase):
+    """Smoke tests: run the body-3d-keypoints pipeline on a sample video."""
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_canonical_body-3d-keypoints_video'
+        self.test_video = 'data/test/videos/Walking.54138969.mp4'
+
+    def pipeline_inference(self, pipeline: Pipeline, pipeline_input):
+        output = pipeline(pipeline_input)  # input: video path or capture
+        poses = np.array(output[OutputKeys.POSES])
+        print(f'result 3d points shape {poses.shape}')  # no assertion is made
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_with_video_file(self):
+        body_3d_keypoints = pipeline(
+            Tasks.body_3d_keypoints, model=self.model_id)
+        self.pipeline_inference(body_3d_keypoints, self.test_video)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_with_video_stream(self):
+        body_3d_keypoints = pipeline(Tasks.body_3d_keypoints)
+        cap = cv2.VideoCapture(self.test_video)  # pass an opened capture
+        if not cap.isOpened():
+            raise Exception('modelscope error: %s cannot be decoded by OpenCV.'
+                            % (self.test_video))
+        self.pipeline_inference(body_3d_keypoints, cap)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        body_3d_keypoints = pipeline(Tasks.body_3d_keypoints)  # no model id
+        self.pipeline_inference(body_3d_keypoints, self.test_video)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 8c9348de2cae660875cc6a728bea64e56eb0c73f Mon Sep 17 00:00:00 2001
From: "eniac.xcw" <eniac.xcw@alibaba-inc.com>
Date: Tue, 30 Aug 2022 10:07:39 +0800
Subject: [PATCH 018/175] [to #42322933]add team model         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9908976

---
 modelscope/metainfo.py                        |   2 +
 modelscope/models/multi_modal/__init__.py     |   2 +
 .../models/multi_modal/team/__init__.py       |   1 +
 .../models/multi_modal/team/team_model.py     | 126 +++++++
 modelscope/models/multi_modal/team/utils.py   | 326 ++++++++++++++++++
 modelscope/outputs.py                         |   9 +
 modelscope/pipelines/builder.py               |   3 +
 .../team_multi_modal_similarity_pipeline.py   |  31 ++
 modelscope/utils/constant.py                  |   1 +
 .../pipelines/test_multi_modal_similarity.py  |  42 +++
 10 files changed, 543 insertions(+)
 create mode 100644 modelscope/models/multi_modal/team/__init__.py
 create mode 100644 modelscope/models/multi_modal/team/team_model.py
 create mode 100644 modelscope/models/multi_modal/team/utils.py
 create mode 100644 modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
 create mode 100644 tests/pipelines/test_multi_modal_similarity.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index d9e53ca7..58fd4f46 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -59,6 +59,7 @@ class Models(object):
     gemm = 'gemm-generative-multi-modal'
     mplug = 'mplug'
     diffusion = 'diffusion-text-to-image-synthesis'
+    team = 'team-multi-modal-similarity'
     video_clip = 'video-clip-multi-modal-embedding'
 
 
@@ -166,6 +167,7 @@ class Pipelines(object):
     visual_question_answering = 'visual-question-answering'
     visual_grounding = 'visual-grounding'
     visual_entailment = 'visual-entailment'
+    multi_modal_similarity = 'multi-modal-similarity'
     text_to_image_synthesis = 'text-to-image-synthesis'
     video_multi_modal_embedding = 'video-multi-modal-embedding'
 
diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py
index 112b3a58..9219a281 100644
--- a/modelscope/models/multi_modal/__init__.py
+++ b/modelscope/models/multi_modal/__init__.py
@@ -7,6 +7,7 @@ if TYPE_CHECKING:
 
     from .clip import CLIPForMultiModalEmbedding
     from .gemm import GEMMForMultiModalEmbedding
+    from .team import TEAMForMultiModalSimilarity
     from .diffusion import DiffusionForTextToImageSynthesis
     from .mmr import VideoCLIPForMultiModalEmbedding
     from .mplug_for_all_tasks import MPlugForAllTasks
@@ -19,6 +20,7 @@ else:
         'clip': ['CLIPForMultiModalEmbedding'],
         'diffusion': ['DiffusionForTextToImageSynthesis'],
         'gemm': ['GEMMForMultiModalEmbedding'],
+        'team': ['TEAMForMultiModalSimilarity'],
         'mmr': ['VideoCLIPForMultiModalEmbedding'],
         'mplug_for_all_tasks': ['MPlugForAllTasks'],
         'ofa_for_all_tasks': ['OfaForAllTasks'],
diff --git a/modelscope/models/multi_modal/team/__init__.py b/modelscope/models/multi_modal/team/__init__.py
new file mode 100644
index 00000000..0597040c
--- /dev/null
+++ b/modelscope/models/multi_modal/team/__init__.py
@@ -0,0 +1 @@
+from .team_model import TEAMForMultiModalSimilarity
diff --git a/modelscope/models/multi_modal/team/team_model.py b/modelscope/models/multi_modal/team/team_model.py
new file mode 100644
index 00000000..4aa77e17
--- /dev/null
+++ b/modelscope/models/multi_modal/team/team_model.py
@@ -0,0 +1,126 @@
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+from tokenizers import BertWordPieceTokenizer
+from torchvision.transforms import Compose, Normalize, Resize, ToTensor
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .utils import TEAM, BertWrapper, CLIPVisionWrapper, CrossLayer
+
+logger = get_logger()
+
+__all__ = ['TEAMForMultiModalSimilarity']
+
+
+@MODELS.register_module(Tasks.multi_modal_similarity, module_name=Models.team)
+class TEAMForMultiModalSimilarity(TorchModel):
+    """TEAM image-text model: embeds each input and scores the pair."""
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+
+        text_model = BertWrapper(
+            config_json='{}/text_config.json'.format(model_dir),
+            feat_dim=768,
+            token_dim=1024)
+        text_model.bert.cls = None
+        image_model = CLIPVisionWrapper()
+
+        self.model = TEAM(
+            text_model,
+            image_model,
+            pretrained='{}/{}'.format(model_dir,
+                                      ModelFile.TORCH_MODEL_BIN_FILE))
+        self.model.eval()
+
+        self.device_id = device_id
+        if self.device_id >= 0 and torch.cuda.is_available():
+            self.model.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        else:
+            self.device_id = -1  # later device checks rely on this value
+            logger.info('Use CPU for inference')
+
+        self.text_tokenizer = BertWordPieceTokenizer(
+            '{}/{}'.format(model_dir, ModelFile.VOCAB_FILE), lowercase=False)
+        self.text_tokenizer.enable_truncation(max_length=30)
+
+        norm_op = Normalize((0.48145466, 0.4578275, 0.40821073),
+                            (0.26862954, 0.26130258, 0.27577711))
+        self.img_preprocessor = Compose([
+            Resize((224, 224), interpolation=Image.BICUBIC),
+            ToTensor(), norm_op
+        ])
+
+    def tokenize_text(self, text_str):
+        tokens = self.text_tokenizer.encode(text_str)
+        max_tokens = 30  # same limit as enable_truncation(max_length=30)
+        text_ids_tensor = torch.zeros((1, max_tokens)).long()
+        text_mask_tensor = torch.zeros((1, max_tokens))
+        text_ids, text_mask = tokens.ids, tokens.attention_mask
+        text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids)
+        text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask)
+        return text_ids_tensor, text_mask_tensor
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        with torch.no_grad():
+            if 'img' in input and input['img'] is not None:
+                input_img = input['img']
+                input_img = LoadImage.convert_to_img(input_img)
+                img_tensor = self.img_preprocessor(input_img)[None, ...]
+
+                if self.device_id >= 0:
+                    img_tensor = img_tensor.to('cuda:{}'.format(
+                        self.device_id))
+                _, _, image_feature, image_tensors = self.model.get_feature(
+                    None, None, img_tensor)
+                image_feature = image_feature.cpu().numpy()
+            else:
+                image_feature, image_tensors = None, None
+
+            if 'text' in input and input['text'] is not None:
+                text_str = input['text']
+                if isinstance(text_str, str):
+                    text_ids_tensor, text_mask_tensor = self.tokenize_text(
+                        text_str)
+                else:
+                    raise TypeError(
+                        f'text should be str, but got {type(text_str)}')
+
+                if self.device_id >= 0:
+                    text_ids_tensor = text_ids_tensor.to('cuda:{}'.format(
+                        self.device_id))
+                    text_mask_tensor = text_mask_tensor.to('cuda:{}'.format(
+                        self.device_id))
+                text_feature, text_tensors, _, _ = self.model.get_feature(
+                    text_ids_tensor, text_mask_tensor, None)
+                text_feature = text_feature.cpu().numpy()
+            else:  # text_feature must exist too: it is read in output below
+                text_feature, text_tensors, text_mask_tensor = None, None, None
+
+            if text_tensors is not None and text_mask_tensor is not None and image_tensors is not None:
+                score = self.model.get_cross_score(text_tensors,
+                                                   text_mask_tensor,
+                                                   image_tensors)[0].item()
+            else:
+                score = None  # a score needs both an image and a text
+            output = {
+                OutputKeys.IMG_EMBEDDING: image_feature,
+                OutputKeys.TEXT_EMBEDDING: text_feature,
+                OutputKeys.SCORES: score
+            }
+            return output
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/models/multi_modal/team/utils.py b/modelscope/models/multi_modal/team/utils.py
new file mode 100644
index 00000000..3b3e394e
--- /dev/null
+++ b/modelscope/models/multi_modal/team/utils.py
@@ -0,0 +1,326 @@
+""" TEAM Multi-Modal Similarity Model
+Base Transformer code is adapted from https://github.com/openai/CLIP/,
+originally MIT License, Copyright (c) 2021 OpenAI.
+"""
+from collections import OrderedDict
+from typing import Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from torch import nn
+from transformers import BertConfig, BertForMaskedLM
+
+
+class LayerNorm(nn.LayerNorm):
+    """LayerNorm that computes in float32 and returns the input's dtype."""
+
+    def forward(self, x: torch.Tensor):
+        # run the normalisation in float32, then restore the caller's dtype
+        normed = super().forward(x.type(torch.float32))
+        return normed.type(x.dtype)
+
+
+class QuickGELU(nn.Module):
+    """Activation computed as x * sigmoid(1.702 * x)."""
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):
+    """Pre-norm block: self-attention and a 4x-wide MLP, each residual."""
+    def __init__(self,
+                 d_model: int,
+                 n_head: int,
+                 attn_mask: torch.Tensor = None):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
+                         ('gelu', QuickGELU()),
+                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask  # optional; re-cast on every call below
+
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(
+            dtype=x.dtype,
+            device=x.device) if self.attn_mask is not None else None
+        return self.attn(
+            x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.attention(self.ln_1(x))  # attention residual
+        x = x + self.mlp(self.ln_2(x))  # MLP residual
+        return x
+
+
+class Transformer(nn.Module):
+    """Stack of `layers` ResidualAttentionBlocks applied in sequence."""
+    def __init__(self,
+                 width: int,
+                 layers: int,
+                 heads: int,
+                 attn_mask: torch.Tensor = None,
+                 use_gc=False):
+        super().__init__()
+        self.use_gc = use_gc  # selects the checkpointed path in forward
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[
+            ResidualAttentionBlock(width, heads, attn_mask)
+            for _ in range(layers)
+        ])
+
+    def forward(self, x: torch.Tensor):
+        if self.use_gc:  # run each block through torch.utils.checkpoint
+            for each_block in self.resblocks:
+                x = checkpoint.checkpoint(each_block, x)
+            return x
+        else:
+            return self.resblocks(x)
+
+
+class VisionTransformer(nn.Module):
+    """ViT image encoder: patch conv, class token, transformer, projection."""
+    def __init__(self,
+                 input_resolution: int,
+                 patch_size: int,
+                 width: int,
+                 layers: int,
+                 heads: int,
+                 output_dim: int,
+                 use_gc=False):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False)
+
+        scale = width**-0.5  # std of the random initial embeddings below
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(
+            (input_resolution // patch_size)**2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+
+        self.transformer = Transformer(width, layers, heads, use_gc=use_gc)
+
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+
+    def forward(self, x: torch.Tensor):
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        class_embedding = self.class_embedding.to(x.dtype) + \
+            torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        x = torch.cat([class_embedding, x],
+                      dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        x = self.ln_post(x[:, 0, :])  # keep only the class-token position
+
+        if self.proj is not None:
+            x = x @ self.proj
+
+        return x
+
+
+class CLIPVisionWrapper(nn.Module):
+    """Runs the ViT and also returns the pre-ln_post token sequence."""
+    def __init__(self, ):
+        super().__init__()
+        self.vision_transformer = VisionTransformer(
+            input_resolution=224,
+            patch_size=14,
+            width=1024,
+            layers=24,
+            heads=16,
+            output_dim=768)
+
+    def forward(self, x):
+        x = self.vision_transformer.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        class_embedding = self.vision_transformer.class_embedding.to(x.dtype) + \
+            torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        x = torch.cat([class_embedding, x],
+                      dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.vision_transformer.positional_embedding.to(x.dtype)
+        x = self.vision_transformer.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.vision_transformer.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        x_tensor = x.clone()  # all tokens, before ln_post and projection
+        x = self.vision_transformer.ln_post(x[:, 0, :])
+
+        if self.vision_transformer.proj is not None:
+            x = x @ self.vision_transformer.proj
+
+        return x, x_tensor
+
+
+class BertWrapper(nn.Module):
+    """BERT encoder with linear heads for the CLS feature and all tokens."""
+    def __init__(self, config_json, feat_dim, token_dim):
+        super(BertWrapper, self).__init__()
+        bert_config = BertConfig.from_json_file(config_json)
+        self.bert = BertForMaskedLM(bert_config).bert  # built from config
+        # NOTE(review): 768 presumably matches the BERT hidden size - confirm
+        self.projector = nn.Linear(768, feat_dim, bias=False)
+        self.projector_token_embeds = nn.Linear(768, token_dim)
+
+    def forward(self, input_ids, attention_mask):
+        trans_features = {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask
+        }
+        output_states = self.bert(**trans_features, return_dict=False)
+        output_tokens = output_states[0]  # first element of the tuple output
+
+        cls_tokens = output_tokens[:, 0, :]  # CLS token is first token
+
+        return self.projector(cls_tokens), self.projector_token_embeds(
+            output_tokens)
+
+
+class Mlp(nn.Module):
+    """Two-layer feed-forward net: fc1, activation, dropout, fc2, dropout."""
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        # unset (falsy) widths fall back to the input width
+        output_width = out_features or in_features
+        hidden_width = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_width)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_width, output_width)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        """Apply the MLP; the same dropout module is used at both points."""
+        hidden = self.act(self.fc1(x))
+        hidden = self.drop(hidden)
+        return self.drop(self.fc2(hidden))
+
+
+class CrossLayer(nn.Module):
+    """Self-attention, image cross-attention and an MLP, each residual."""
+    def __init__(self, feat_dim, mlp_ratio):
+        super(CrossLayer, self).__init__()
+        self.norm1 = nn.LayerNorm(feat_dim)
+        self.norm2 = nn.LayerNorm(feat_dim)
+        self.norm3 = nn.LayerNorm(feat_dim)
+
+        self.self_attn = nn.MultiheadAttention(
+            embed_dim=feat_dim, num_heads=16)
+        self.cross_attn = nn.MultiheadAttention(
+            embed_dim=feat_dim, num_heads=16)
+        self.ffn = Mlp(
+            in_features=feat_dim,
+            hidden_features=feat_dim * mlp_ratio,
+            drop=0.1)
+
+        self.dropout1 = nn.Dropout(0.1)
+        self.dropout2 = nn.Dropout(0.1)
+        self.dropout3 = nn.Dropout(0.1)
+
+    def forward(self, text_tensors, text_masks, image_tensors,
+                retrieved_tensors):
+        retrieved_tensors_res = self.norm1(retrieved_tensors)
+        retrieved_tensors_res = self.self_attn(
+            (text_tensors + retrieved_tensors_res).permute(1, 0, 2),
+            (text_tensors + retrieved_tensors_res).permute(1, 0, 2),
+            retrieved_tensors_res.permute(1, 0, 2),  # value: the state alone
+            key_padding_mask=(text_masks == 0),  # ignore keys with mask 0
+        )[0].permute(1, 0, 2)
+        retrieved_tensors = retrieved_tensors + self.dropout1(
+            retrieved_tensors_res)
+
+        retrieved_tensors_res = self.norm2(retrieved_tensors)
+        retrieved_tensors_res = self.cross_attn(
+            (text_tensors + retrieved_tensors_res).permute(1, 0, 2),
+            image_tensors.permute(1, 0, 2),  # keys come from the image
+            image_tensors.permute(1, 0, 2))[0].permute(1, 0, 2)
+        retrieved_tensors = retrieved_tensors + self.dropout2(
+            retrieved_tensors_res)
+
+        retrieved_tensors_res = self.norm3(retrieved_tensors)
+        retrieved_tensors = retrieved_tensors + self.dropout3(
+            self.ffn(retrieved_tensors_res))
+
+        return retrieved_tensors
+
+
+class TEAM(nn.Module):
+    """Text and image encoders plus cross layers that score a pair."""
+    def __init__(self, text_model, image_model, pretrained):
+        super(TEAM, self).__init__()
+        self.text_model = text_model
+        self.image_model = image_model
+
+        self.cross_model = nn.ModuleList(
+            [CrossLayer(feat_dim=1024, mlp_ratio=2)])
+
+        self.image_tensor_fc = nn.Linear(1024, 768)
+        self.text_tensor_fc = nn.Linear(1024, 768)
+        # NOTE(review): torch.load unpickles the file; only load trusted ones
+        params = torch.load(pretrained, 'cpu')
+        self.load_state_dict(params, strict=True)
+
+    def get_feature(self, text_data=None, text_mask=None, img_tensor=None):
+        if text_data is not None:
+            text_feature, text_tensors = self.text_model(text_data, text_mask)
+            text_feature = F.normalize(text_feature, p=2.0, dim=1)
+        else:
+            text_feature, text_tensors = None, None
+
+        if img_tensor is not None:
+            image_feature, image_tensors = self.image_model(img_tensor)
+            image_feature = F.normalize(image_feature, p=2.0, dim=1)
+        else:
+            image_feature, image_tensors = None, None
+
+        return text_feature, text_tensors, image_feature, image_tensors
+
+    def get_cross_score(self, text_tensors, text_mask, image_tensors):
+        retrieved_tensors = torch.zeros_like(text_tensors)  # initial state
+        pair_score_list = []  # one entry per cross layer
+        text_tensors_proj = self.text_tensor_fc(text_tensors)
+        text_mask_float = text_mask.type(text_tensors_proj.dtype)
+        for each_cross_model in self.cross_model:
+            retrieved_tensors = each_cross_model(text_tensors, text_mask,
+                                                 image_tensors,
+                                                 retrieved_tensors)
+            retrieved_tensors_proj = self.image_tensor_fc(retrieved_tensors)
+            # per-token cosine similarity, then a mask-weighted token mean
+            pair_score = torch.sum(
+                F.normalize(retrieved_tensors_proj, p=2.0, dim=2)
+                * F.normalize(text_tensors_proj, p=2.0, dim=2),
+                dim=2)
+            pair_score_reduced = torch.sum(
+                pair_score * text_mask_float, dim=1) / torch.clamp(
+                    torch.sum(text_mask_float, dim=1), min=1.0)
+            pair_score_list.append(pair_score_reduced)
+        return pair_score_list
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 622d9034..1c42a5f3 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -499,6 +499,15 @@ TASK_OUTPUTS = {
     Tasks.generative_multi_modal_embedding:
     [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION],
 
+    # multi-modal similarity result for single sample
+    # {
+    #   "img_embedding": np.array with shape [1, D],
+    #   "text_embedding": np.array with shape [1, D],
+    #   "scores": float
+    # }
+    Tasks.multi_modal_similarity:
+    [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],
+
     # VQA result for a sample
     # {"text": "this is a text answser. "}
     Tasks.visual_question_answering: [OutputKeys.TEXT],
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index f8f679e6..53f55b06 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -79,6 +79,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     (Pipelines.generative_multi_modal_embedding,
      'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
      ),
+    Tasks.multi_modal_similarity:
+    (Pipelines.multi_modal_similarity,
+     'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity'),
     Tasks.visual_question_answering:
     (Pipelines.visual_question_answering,
      'damo/mplug_visual-question-answering_coco_large_en'),
diff --git a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
new file mode 100644
index 00000000..7d3ffed3
--- /dev/null
+++ b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
@@ -0,0 +1,31 @@
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.base import Input, Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.multi_modal_similarity, module_name=Pipelines.multi_modal_similarity)
+class TEAMMultiModalSimilarityPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Create a multi-modal similarity pipeline from `model`.
+
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input  # no preprocessing here; forwarded unchanged
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        return self.model(input)  # the model is called on the input directly
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs  # the model output is returned unchanged
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 2141a012..6d419a7e 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -117,6 +117,7 @@ class MultiModalTasks(object):
     text_to_image_synthesis = 'text-to-image-synthesis'
     multi_modal_embedding = 'multi-modal-embedding'
     generative_multi_modal_embedding = 'generative-multi-modal-embedding'
+    multi_modal_similarity = 'multi-modal-similarity'
     visual_question_answering = 'visual-question-answering'
     visual_entailment = 'visual-entailment'
     video_multi_modal_embedding = 'video-multi-modal-embedding'
diff --git a/tests/pipelines/test_multi_modal_similarity.py b/tests/pipelines/test_multi_modal_similarity.py
new file mode 100644
index 00000000..d1d6a7a8
--- /dev/null
+++ b/tests/pipelines/test_multi_modal_similarity.py
@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+from modelscope.models import Model
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class MultiModalSimilarityTest(unittest.TestCase):
+    model_id = 'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity'
+    test_input = {
+        'img': 'data/test/images/generative_multimodal.jpg',
+        'text': '起居室照片'
+    }
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        # build the pipeline from the hub model id
+        similarity_pipe = pipeline(
+            Tasks.multi_modal_similarity, model=self.model_id)
+        print(similarity_pipe(self.test_input))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        # no model given: the default registered for the task is used
+        similarity_pipe = pipeline(task=Tasks.multi_modal_similarity)
+        result = similarity_pipe(self.test_input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        # pass an already loaded Model instead of a model id
+        loaded = Model.from_pretrained(self.model_id)
+        similarity_pipe = pipeline(
+            task=Tasks.multi_modal_similarity, model=loaded)
+        print(similarity_pipe(self.test_input))
+
+
+if __name__ == '__main__':
+    unittest.main()

From 3e66244c0d9e6bd052b77623ae96c3c0eb64e005 Mon Sep 17 00:00:00 2001
From: "bin.xue" <bin.xue@alibaba-inc.com>
Date: Tue, 30 Aug 2022 10:23:22 +0800
Subject: [PATCH 019/175] [to #42322933] Add ANS trainer         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9892528

---
 modelscope/metainfo.py                     |  4 ++
 modelscope/metrics/__init__.py             |  2 +
 modelscope/metrics/audio_noise_metric.py   | 38 +++++++++++++++
 modelscope/metrics/builder.py              |  1 +
 modelscope/models/audio/ans/frcrn.py       | 54 +++++++++++---------
 modelscope/pipelines/audio/ans_pipeline.py | 19 ++------
 modelscope/trainers/__init__.py            |  2 +
 modelscope/trainers/audio/__init__.py      |  0
 modelscope/trainers/audio/ans_trainer.py   | 57 ++++++++++++++++++++++
 modelscope/utils/audio/audio_utils.py      | 35 +++++++++++++
 tests/trainers/audio/__init__.py           |  0
 tests/trainers/audio/test_ans_trainer.py   | 56 +++++++++++++++++++++
 12 files changed, 232 insertions(+), 36 deletions(-)
 create mode 100644 modelscope/metrics/audio_noise_metric.py
 create mode 100644 modelscope/trainers/audio/__init__.py
 create mode 100644 modelscope/trainers/audio/ans_trainer.py
 create mode 100644 modelscope/utils/audio/audio_utils.py
 create mode 100644 tests/trainers/audio/__init__.py
 create mode 100644 tests/trainers/audio/test_ans_trainer.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 58fd4f46..eab870ae 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -198,6 +198,9 @@ class Trainers(object):
     nlp_base_trainer = 'nlp-base-trainer'
     nlp_veco_trainer = 'nlp-veco-trainer'
 
+    # audio trainers
+    speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
+
 
 class Preprocessors(object):
     """ Names for different preprocessor.
@@ -254,6 +257,7 @@ class Metrics(object):
 
     # accuracy
     accuracy = 'accuracy'
+    audio_noise_metric = 'audio-noise-metric'
 
     # metrics for image denoise task
     image_denoise_metric = 'image-denoise-metric'
diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py
index d307f7c9..c74b475e 100644
--- a/modelscope/metrics/__init__.py
+++ b/modelscope/metrics/__init__.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
+    from .audio_noise_metric import AudioNoiseMetric
     from .base import Metric
     from .builder import METRICS, build_metric, task_default_metrics
     from .image_color_enhance_metric import ImageColorEnhanceMetric
@@ -18,6 +19,7 @@ if TYPE_CHECKING:
 
 else:
     _import_structure = {
+        'audio_noise_metric': ['AudioNoiseMetric'],
         'base': ['Metric'],
         'builder': ['METRICS', 'build_metric', 'task_default_metrics'],
         'image_color_enhance_metric': ['ImageColorEnhanceMetric'],
diff --git a/modelscope/metrics/audio_noise_metric.py b/modelscope/metrics/audio_noise_metric.py
new file mode 100644
index 00000000..16c5261f
--- /dev/null
+++ b/modelscope/metrics/audio_noise_metric.py
@@ -0,0 +1,38 @@
+from typing import Dict
+
+from modelscope.metainfo import Metrics
+from modelscope.metrics.base import Metric
+from modelscope.metrics.builder import METRICS, MetricKeys
+from modelscope.utils.registry import default_group
+
+
+@METRICS.register_module(
+    group_key=default_group, module_name=Metrics.audio_noise_metric)
+class AudioNoiseMetric(Metric):
+    """
+    Metric for the acoustic noise suppression task: accumulates the loss
+    terms of every batch and reports their averages.
+    """
+
+    def __init__(self):
+        # one list per tracked output key, filled by add()
+        self.loss, self.amp_loss = [], []
+        self.phase_loss, self.sisnr = [], []
+
+    def add(self, outputs: Dict, inputs: Dict):
+        # `inputs` is unused; values are stored via `.data.cpu()`
+        tracked = ((self.loss, 'loss'), (self.amp_loss, 'amp_loss'),
+                   (self.phase_loss, 'phase_loss'), (self.sisnr, 'sisnr'))
+        for store, key in tracked:
+            store.append(outputs[key].data.cpu())
+
+    def evaluate(self):
+        mean_loss, mean_sisnr, mean_amp, mean_phase = (
+            sum(values) / len(values) for values in (
+                self.loss, self.sisnr, self.amp_loss, self.phase_loss))
+        total = mean_loss + mean_amp + mean_phase + mean_sisnr
+        return {
+            'total_loss': total.item(),
+            'avg_sisnr': mean_sisnr.item(),
+            MetricKeys.AVERAGE_LOSS: mean_loss.item()
+        }
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py
index 9ba80a6c..869a1ab2 100644
--- a/modelscope/metrics/builder.py
+++ b/modelscope/metrics/builder.py
@@ -16,6 +16,7 @@ class MetricKeys(object):
     RECALL = 'recall'
     PSNR = 'psnr'
     SSIM = 'ssim'
+    AVERAGE_LOSS = 'avg_loss'
     FScore = 'fscore'
 
 
diff --git a/modelscope/models/audio/ans/frcrn.py b/modelscope/models/audio/ans/frcrn.py
index ba78ab74..59411fbe 100644
--- a/modelscope/models/audio/ans/frcrn.py
+++ b/modelscope/models/audio/ans/frcrn.py
@@ -71,32 +71,41 @@ class FRCRNModel(TorchModel):
             model_dir (str): the model path.
         """
         super().__init__(model_dir, *args, **kwargs)
-        kwargs.pop('device')
         self.model = FRCRN(*args, **kwargs)
         model_bin_file = os.path.join(model_dir,
                                       ModelFile.TORCH_MODEL_BIN_FILE)
         if os.path.exists(model_bin_file):
-            checkpoint = torch.load(model_bin_file)
-            self.model.load_state_dict(checkpoint, strict=False)
+            checkpoint = torch.load(
+                model_bin_file, map_location=torch.device('cpu'))
+            if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
+                self.model.load_state_dict(
+                    checkpoint['state_dict'], strict=False)
+            else:
+                self.model.load_state_dict(checkpoint, strict=False)
 
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        output = self.model.forward(input)
-        return {
-            'spec_l1': output[0],
-            'wav_l1': output[1],
-            'mask_l1': output[2],
-            'spec_l2': output[3],
-            'wav_l2': output[4],
-            'mask_l2': output[5]
+        result_list = self.model.forward(input['noisy'])
+        output = {
+            'spec_l1': result_list[0],
+            'wav_l1': result_list[1],
+            'mask_l1': result_list[2],
+            'spec_l2': result_list[3],
+            'wav_l2': result_list[4],
+            'mask_l2': result_list[5]
         }
-
-    def to(self, *args, **kwargs):
-        self.model = self.model.to(*args, **kwargs)
-        return self
-
-    def eval(self):
-        self.model = self.model.train(False)
-        return self
+        if 'clean' in input:
+            mix_result = self.model.loss(
+                input['noisy'], input['clean'], result_list, mode='Mix')
+            output.update(mix_result)
+            sisnr_result = self.model.loss(
+                input['noisy'], input['clean'], result_list, mode='SiSNR')
+            output.update(sisnr_result)
+            # logger hooker will use items under 'log_vars'
+            output['log_vars'] = {k: mix_result[k].item() for k in mix_result}
+            output['log_vars'].update(
+                {k: sisnr_result[k].item()
+                 for k in sisnr_result})
+        return output
 
 
 class FRCRN(nn.Module):
@@ -111,7 +120,8 @@ class FRCRN(nn.Module):
                  win_len=400,
                  win_inc=100,
                  fft_len=512,
-                 win_type='hanning'):
+                 win_type='hanning',
+                 **kwargs):
         r"""
         Args:
             complex: Whether to use complex networks.
@@ -237,7 +247,7 @@ class FRCRN(nn.Module):
                 if count != 3:
                     loss = self.loss_1layer(noisy, est_spec, est_wav, labels,
                                             est_mask, mode)
-            return loss
+            return dict(sisnr=loss)
 
         elif mode == 'Mix':
             count = 0
@@ -252,7 +262,7 @@ class FRCRN(nn.Module):
                     amp_loss, phase_loss, SiSNR_loss = self.loss_1layer(
                         noisy, est_spec, est_wav, labels, est_mask, mode)
                     loss = amp_loss + phase_loss + SiSNR_loss
-            return loss, amp_loss, phase_loss
+            return dict(loss=loss, amp_loss=amp_loss, phase_loss=phase_loss)
 
     def loss_1layer(self, noisy, est, est_wav, labels, cmp_mask, mode='Mix'):
         r""" Compute the loss by mode
diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py
index 410a7cb5..5ed4d769 100644
--- a/modelscope/pipelines/audio/ans_pipeline.py
+++ b/modelscope/pipelines/audio/ans_pipeline.py
@@ -10,21 +10,10 @@ from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.audio.audio_utils import audio_norm
 from modelscope.utils.constant import Tasks
 
 
-def audio_norm(x):
-    rms = (x**2).mean()**0.5
-    scalar = 10**(-25 / 20) / rms
-    x = x * scalar
-    pow_x = x**2
-    avg_pow_x = pow_x.mean()
-    rmsx = pow_x[pow_x > avg_pow_x].mean()**0.5
-    scalarx = 10**(-25 / 20) / rmsx
-    x = x * scalarx
-    return x
-
-
 @PIPELINES.register_module(
     Tasks.acoustic_noise_suppression,
     module_name=Pipelines.speech_frcrn_ans_cirm_16k)
@@ -98,7 +87,8 @@ class ANSPipeline(Pipeline):
                 current_idx = 0
                 while current_idx + window <= t:
                     print('current_idx: {}'.format(current_idx))
-                    tmp_input = ndarray[:, current_idx:current_idx + window]
+                    tmp_input = dict(noisy=ndarray[:, current_idx:current_idx
+                                                   + window])
                     tmp_output = self.model(
                         tmp_input, )['wav_l2'][0].cpu().numpy()
                     end_index = current_idx + window - give_up_length
@@ -111,7 +101,8 @@ class ANSPipeline(Pipeline):
                                     give_up_length:-give_up_length]
                     current_idx += stride
             else:
-                outputs = self.model(ndarray)['wav_l2'][0].cpu().numpy()
+                outputs = self.model(
+                    dict(noisy=ndarray))['wav_l2'][0].cpu().numpy()
         outputs = (outputs[:nsamples] * 32768).astype(np.int16).tobytes()
         return {OutputKeys.OUTPUT_PCM: outputs}
 
diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py
index 17ed7f3c..32ff674f 100644
--- a/modelscope/trainers/__init__.py
+++ b/modelscope/trainers/__init__.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
+    from .audio.ans_trainer import ANSTrainer
     from .base import DummyTrainer
     from .builder import build_trainer
     from .cv import (ImageInstanceSegmentationTrainer,
@@ -15,6 +16,7 @@ if TYPE_CHECKING:
 
 else:
     _import_structure = {
+        'audio.ans_trainer': ['ANSTrainer'],
         'base': ['DummyTrainer'],
         'builder': ['build_trainer'],
         'cv': [
diff --git a/modelscope/trainers/audio/__init__.py b/modelscope/trainers/audio/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/trainers/audio/ans_trainer.py b/modelscope/trainers/audio/ans_trainer.py
new file mode 100644
index 00000000..f782b836
--- /dev/null
+++ b/modelscope/trainers/audio/ans_trainer.py
@@ -0,0 +1,57 @@
+import time
+from typing import List, Optional, Union
+
+from datasets import Dataset
+
+from modelscope.metainfo import Trainers
+from modelscope.preprocessors import Preprocessor
+from modelscope.trainers import EpochBasedTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.utils.constant import TrainerStages
+from modelscope.utils.data_utils import to_device
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@TRAINERS.register_module(module_name=Trainers.speech_frcrn_ans_cirm_16k)
+class ANSTrainer(EpochBasedTrainer):
+    """
+    Trainer for acoustic noise suppression.
+    train_loop() is overridden so that the dataset is used just one time.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def train_loop(self, data_loader):
+        """
+        Run `_max_epochs` epochs, ending each one by step count.
+        """
+        self.invoke_hook(TrainerStages.before_run)
+        self._epoch = 0
+        self.model.train()
+        # one iterator shared by all epochs, so batches are never replayed
+        batches = iter(data_loader)
+        for _ in range(self._epoch, self._max_epochs):
+            self.invoke_hook(TrainerStages.before_train_epoch)
+            self._inner_iter = 0
+            for batch in batches:
+                batch = to_device(batch, self.device)
+                self.data_batch = batch
+                self._inner_iter += 1
+                self.invoke_hook(TrainerStages.before_train_iter)
+                self.train_step(self.model, batch)
+                self.invoke_hook(TrainerStages.after_train_iter)
+                del self.data_batch
+                self._iter += 1
+                if self._inner_iter >= self.iters_per_epoch:
+                    break
+
+            self.invoke_hook(TrainerStages.after_train_epoch)
+            self._epoch += 1
+
+        self.invoke_hook(TrainerStages.after_run)
+
+    def prediction_step(self, model, inputs):
+        pass
diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py
new file mode 100644
index 00000000..14374c65
--- /dev/null
+++ b/modelscope/utils/audio/audio_utils.py
@@ -0,0 +1,35 @@
+import numpy as np
+
+SEGMENT_LENGTH_TRAIN = 16000
+
+
+def to_segment(batch, segment_length=SEGMENT_LENGTH_TRAIN):
+    """
+    Dataset mapping function to split one audio into segments.
+    It only works in batch mode. A trailing partial segment is dropped.
+    """
+
+    def split(items):
+        # `length + 1` keeps the last segment when it ends exactly at the
+        # end of the audio; range(..., length, ...) silently dropped it.
+        segments = []
+        for x in items:
+            audio = np.array(x['array'])
+            length = len(x['array'])
+            for offset in range(segment_length, length + 1, segment_length):
+                segments.append(audio[offset - segment_length:offset])
+        return segments
+
+    return {'noisy': split(batch['noisy']), 'clean': split(batch['clean'])}
+
+
+def audio_norm(x):
+    """Rescale `x` to -25 dB RMS, then again using only the loud samples."""
+    target_rms = 10**(-25 / 20)
+    # first pass: RMS over every sample
+    x = x * (target_rms / (x**2).mean()**0.5)
+    # second pass: RMS over samples whose power exceeds the mean power
+    power = x**2
+    loud_rms = power[power > power.mean()].mean()**0.5
+    x = x * (target_rms / loud_rms)
+    return x
diff --git a/tests/trainers/audio/__init__.py b/tests/trainers/audio/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py
new file mode 100644
index 00000000..176c811f
--- /dev/null
+++ b/tests/trainers/audio/test_ans_trainer.py
@@ -0,0 +1,56 @@
+import os
+import shutil
+import tempfile
+import unittest
+from functools import partial
+
+from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.audio.audio_utils import to_segment
+from modelscope.utils.test_utils import test_level
+
+SEGMENT_LENGTH_TEST = 640
+
+
+class TestANSTrainer(unittest.TestCase):
+
+    def setUp(self):
+        # mkdtemp() creates the directory and leaves removal to tearDown();
+        # TemporaryDirectory().name pointed at a directory deleted on GC.
+        self.tmp_dir = tempfile.mkdtemp()
+
+        self.model_id = 'damo/speech_frcrn_ans_cirm_16k'
+
+        hf_ds = MsDataset.load(
+            'ICASSP_2021_DNS_Challenge', split='test').to_hf_dataset()
+        mapped_ds = hf_ds.map(
+            partial(to_segment, segment_length=SEGMENT_LENGTH_TEST),
+            remove_columns=['duration'],
+            batched=True,
+            batch_size=2)
+        self.dataset = MsDataset.from_hf_dataset(mapped_ds)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer(self):
+        kwargs = dict(
+            model=self.model_id,
+            model_revision='beta',
+            train_dataset=self.dataset,
+            eval_dataset=self.dataset,
+            max_epochs=2,
+            train_iters_per_epoch=2,
+            val_iters_per_epoch=1,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            Trainers.speech_frcrn_ans_cirm_16k, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(2):  # one checkpoint per epoch (max_epochs=2)
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)

From a9089570e540cf0ea3c3adee278109a49dd90d73 Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Tue, 30 Aug 2022 10:50:52 +0800
Subject: [PATCH 020/175] [to #42322933] Add mplug retrieval pipeline and
 finetune
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

支持 MPLUG 模型 image-text-retrieval 任务的 pipeline 和 finetune
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9919955
---
 data/test/images/image-text-retrieval.jpg     |   3 +
 modelscope/metainfo.py                        |   1 +
 .../multi_modal/mplug/configuration_mplug.py  |   8 +
 .../multi_modal/mplug/modeling_mplug.py       | 320 ++++++++++++++++--
 .../models/multi_modal/mplug_for_all_tasks.py |  68 ++--
 .../image_text_retrieval_pipeline.py          |  51 +++
 modelscope/preprocessors/multi_modal.py       |  36 +-
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_mplug_tasks.py           |  21 ++
 tests/trainers/test_finetune_mplug.py         |  71 ++--
 10 files changed, 488 insertions(+), 92 deletions(-)
 create mode 100644 data/test/images/image-text-retrieval.jpg
 create mode 100644 modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py

diff --git a/data/test/images/image-text-retrieval.jpg b/data/test/images/image-text-retrieval.jpg
new file mode 100644
index 00000000..2d20374a
--- /dev/null
+++ b/data/test/images/image-text-retrieval.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b012c7e966f6550874ccb85ef9602d483aa89b8623dff9ffcdb0faab8f2ca9ab
+size 218143
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index eab870ae..b4d005a7 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -170,6 +170,7 @@ class Pipelines(object):
     multi_modal_similarity = 'multi-modal-similarity'
     text_to_image_synthesis = 'text-to-image-synthesis'
     video_multi_modal_embedding = 'video-multi-modal-embedding'
+    image_text_retrieval = 'image-text-retrieval'
 
 
 class Trainers(object):
diff --git a/modelscope/models/multi_modal/mplug/configuration_mplug.py b/modelscope/models/multi_modal/mplug/configuration_mplug.py
index c275ed15..914678c5 100644
--- a/modelscope/models/multi_modal/mplug/configuration_mplug.py
+++ b/modelscope/models/multi_modal/mplug/configuration_mplug.py
@@ -64,6 +64,10 @@ class MPlugConfig(PretrainedConfig):
             clip_transformer_width=768,
             clip_transformer_heads=12,
             clip_transformer_layers=12,
+            # retrieval
+            queue_size=65536,
+            embed_dim=256,
+            temp=0.07,
             **kwargs):
 
         super().__init__(**kwargs)
@@ -99,6 +103,10 @@ class MPlugConfig(PretrainedConfig):
         self.clip_transformer_width = clip_transformer_width
         self.clip_transformer_heads = clip_transformer_heads
         self.clip_transformer_layers = clip_transformer_layers
+        # retrieval
+        self.queue_size = queue_size
+        self.embed_dim = embed_dim
+        self.temp = temp
 
     @classmethod
     def from_yaml_file(cls, yaml_file: Union[str,
diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py
index 6311bd31..78f60f9b 100755
--- a/modelscope/models/multi_modal/mplug/modeling_mplug.py
+++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py
@@ -1855,7 +1855,8 @@ class MPlug(PreTrainedModel):
 
         task_mapping = {
             Tasks.visual_question_answering: MPlugForVisualQuestionAnswering,
-            Tasks.image_captioning: MPLUGForImageCaption
+            Tasks.image_captioning: MPlugForImageCaption,
+            Tasks.image_text_retrieval: MPlugForImageTextRetrieval,
         }
         config = cls.config_class.from_yaml_file(
             os.path.join(model_dir, CONFIG_NAME))
@@ -1915,6 +1916,33 @@ class MPlug(PreTrainedModel):
         clip_model.visual.positional_embedding = pos_embed
         return clip_model
 
+    def init_distill(self, config):
+        """Build the `*_m` counterparts of the sub-modules when distilling."""
+        self.distill = config.distill
+        if self.distill:
+            hidden_size = self.config_encoder.hidden_size
+            self.visual_encoder_m = self._initialize_clip(config)
+            self.text_encoder_m = BertModel(
+                self.config_encoder, add_pooling_layer=False)
+            self.fusion_encoder_m = FusionModel(
+                self.config_fusion, add_pooling_layer=False)
+            self.text_decoder_m = BertLMHeadModel(self.config_decoder)
+            self.model_pairs = [
+                [self.visual_encoder, self.visual_encoder_m],
+                [self.text_encoder, self.text_encoder_m],
+                [self.text_decoder, self.text_decoder_m],
+            ]
+            if hidden_size != config.vision_width:
+                self.visn_fc_m = nn.Linear(config.vision_width, hidden_size)
+                self.visn_layer_norm_m = nn.LayerNorm(hidden_size, eps=1e-12)
+                self.dropout_m = nn.Dropout(
+                    self.config_encoder.hidden_dropout_prob)
+                self.model_pairs.extend(
+                    [[self.visn_fc, self.visn_fc_m],
+                     [self.visn_layer_norm, self.visn_layer_norm_m]])
+            self.copy_params()
+            self.momentum = 0.995
+
     def forward(self, *args, **kwargs):
         raise NotImplementedError
 
@@ -1978,33 +2006,6 @@ class MPlugForVisualQuestionAnswering(MPlug):
         self.beam_generator = TextGenerator(config, self.text_decoder)
         self.init_distill(config)
 
-    def init_distill(self, config):
-        self.distill = config.distill
-        if self.distill:
-            self.visual_encoder_m = self._initialize_clip(config)
-            self.text_encoder_m = BertModel(
-                self.config_encoder, add_pooling_layer=False)
-            self.fusion_encoder_m = FusionModel(
-                self.config_fusion, add_pooling_layer=False)
-            self.text_decoder_m = BertLMHeadModel(self.config_decoder)
-            self.model_pairs = [
-                [self.visual_encoder, self.visual_encoder_m],
-                [self.text_encoder, self.text_encoder_m],
-                [self.text_decoder, self.text_decoder_m],
-            ]
-            if self.config_encoder.hidden_size != config.vision_width:
-                self.visn_fc_m = nn.Linear(config.vision_width,
-                                           self.config_encoder.hidden_size)
-                self.visn_layer_norm_m = nn.LayerNorm(
-                    self.config_encoder.hidden_size, eps=1e-12)
-                self.dropout_m = nn.Dropout(
-                    self.config_encoder.hidden_dropout_prob)
-                self.model_pairs.extend(
-                    [[self.visn_fc, self.visn_fc_m],
-                     [self.visn_layer_norm, self.visn_layer_norm_m]])
-            self.copy_params()
-            self.momentum = 0.995
-
     def forward(self,
                 image,
                 question,
@@ -2142,7 +2143,7 @@ class MPlugForVisualQuestionAnswering(MPlug):
             return topk_ids, topk_probs
 
 
-class MPLUGForImageCaption(MPlug):
+class MPlugForImageCaption(MPlug):
 
     def __init__(self, config):
         super().__init__(config)
@@ -2215,3 +2216,264 @@ class MPLUGForImageCaption(MPlug):
         else:
             topk_ids, topk_probs = self.generation(image_embeds, image_atts)
             return topk_ids, topk_probs
+
+
+class MPlugForImageTextRetrieval(MPlug):
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.embed_dim = config.embed_dim
+        self.temp = nn.Parameter(torch.ones([]) * config.temp)
+        self.queue_size = config.queue_size
+        self.momentum = config.momentum
+        self.alpha = config.alpha
+
+        self.queue_size = config.queue_size
+        self.text_width = self.config_encoder.hidden_size
+        self.embed_dim = config.embed_dim
+
+        self.vision_proj = nn.Linear(self.text_width, self.embed_dim)
+        self.text_proj = nn.Linear(self.text_width, self.embed_dim)
+        self.itm_head = nn.Linear(self.text_width, 2)
+
+        self.register_buffer('image_queue',
+                             torch.randn(self.embed_dim, self.queue_size))
+        self.register_buffer('text_queue',
+                             torch.randn(self.embed_dim, self.queue_size))
+        self.register_buffer('idx_queue', torch.full((1, self.queue_size),
+                                                     -100))
+        self.register_buffer('queue_ptr', torch.zeros(1, dtype=torch.long))
+
+        self.image_queue = F.normalize(self.image_queue, dim=0)
+        self.text_queue = F.normalize(self.text_queue, dim=0)
+        self.init_distill(config)
+
+    def init_distill(self, config):
+        self.distill = config.distill
+        if self.distill:
+            self.visual_encoder_m = self._initialize_clip(config)
+            self.text_encoder_m = BertModel(
+                self.config_encoder, add_pooling_layer=False)
+            self.fusion_encoder_m = FusionModel(
+                self.config_fusion, add_pooling_layer=False)
+            self.vision_proj_m = nn.Linear(self.text_width, self.embed_dim)
+            self.text_proj_m = nn.Linear(self.text_width, self.embed_dim)
+            self.model_pairs = [
+                [self.visual_encoder, self.visual_encoder_m],
+                [self.text_encoder, self.text_encoder_m],
+                [self.text_proj, self.text_proj_m],
+                [self.vision_proj, self.vision_proj_m],
+            ]
+            if self.config_encoder.hidden_size != config.vision_width:
+                self.visn_fc_m = nn.Linear(config.vision_width,
+                                           self.config_encoder.hidden_size)
+                self.visn_layer_norm_m = nn.LayerNorm(
+                    self.config_encoder.hidden_size, eps=1e-12)
+                self.dropout_m = nn.Dropout(
+                    self.config_encoder.hidden_dropout_prob)
+                self.model_pairs.extend(
+                    [[self.visn_fc, self.visn_fc_m],
+                     [self.visn_layer_norm, self.visn_layer_norm_m]])
+            self.copy_params()
+            self.momentum = 0.995
+
+    @torch.no_grad()
+    def _dequeue_and_enqueue(self, image_feat, text_feat, idx):
+
+        def concat_all_gather(tensor):
+            """
+            Performs all_gather operation on the provided tensors.
+            *** Warning ***: torch.distributed.all_gather has no gradient.
+            """
+            if not torch.distributed.is_initialized():
+                return tensor
+            tensors_gather = [
+                torch.ones_like(tensor)
+                for _ in range(torch.distributed.get_world_size())
+            ]
+            torch.distributed.all_gather(
+                tensors_gather, tensor, async_op=False)
+
+            output = torch.cat(tensors_gather, dim=0)
+            return output
+
+        # gather keys before updating queue
+        image_feats = concat_all_gather(image_feat)
+        text_feats = concat_all_gather(text_feat)
+        idxs = concat_all_gather(idx)
+
+        batch_size = image_feats.shape[0]
+
+        ptr = int(self.queue_ptr)
+        # assert self.queue_size % batch_size == 0  # for simplicity
+
+        # replace the keys at ptr (dequeue and enqueue)
+        self.image_queue[:, ptr:ptr + batch_size] = image_feats.T
+        self.text_queue[:, ptr:ptr + batch_size] = text_feats.T
+        self.idx_queue[:, ptr:ptr + batch_size] = idxs.T
+        ptr = (ptr + batch_size) % self.queue_size  # move pointer
+
+        self.queue_ptr[0] = ptr
+
+    def forward(self, image, text, idx=None, train=True):
+        if train:
+            image_embeds = self.visual_encoder.visual(
+                image, skip_last_layer=True)
+            if self.large:
+                image_embeds = self.dropout(
+                    self.visn_layer_norm(self.visn_fc(image_embeds)))
+            image_atts = torch.ones(
+                image_embeds.size()[:-1], dtype=torch.long).to(image.device)
+
+            image_feat = F.normalize(
+                self.vision_proj(image_embeds[:, 0, :]), dim=-1)
+            text_output = self.text_encoder(
+                text.input_ids,
+                attention_mask=text.attention_mask,
+                return_dict=True)
+            text_embeds = text_output.last_hidden_state
+            text_feat = F.normalize(
+                self.text_proj(text_embeds[:, 0, :]), dim=-1)
+
+            idx = idx.view(-1, 1)
+            idx_all = torch.cat(
+                [idx.t(), self.idx_queue.clone().detach()], dim=1)
+            pos_idx = torch.eq(idx, idx_all).float()
+            sim_targets = pos_idx / pos_idx.sum(1, keepdim=True)
+
+            with torch.no_grad():
+                self._momentum_update()
+                image_embeds_m = self.visual_encoder_m.visual(
+                    image, skip_last_layer=True)
+                if self.large:
+                    image_embeds_m = self.dropout_m(
+                        self.visn_layer_norm_m(self.visn_fc_m(image_embeds_m)))
+                image_feat_m = F.normalize(
+                    self.vision_proj_m(image_embeds_m[:, 0, :]), dim=-1)
+                image_feat_all = torch.cat(
+                    [image_feat_m.t(),
+                     self.image_queue.clone().detach()],
+                    dim=1)
+                text_output_m = self.text_encoder_m(
+                    text.input_ids,
+                    attention_mask=text.attention_mask,
+                    return_dict=True)
+                text_feat_m = F.normalize(
+                    self.text_proj_m(text_output_m.last_hidden_state[:, 0, :]),
+                    dim=-1)
+                text_feat_all = torch.cat(
+                    [text_feat_m.t(),
+                     self.text_queue.clone().detach()], dim=1)
+
+                if self.distill:
+                    sim_i2t_m = image_feat_m @ text_feat_all / self.temp
+                    sim_t2i_m = text_feat_m @ image_feat_all / self.temp
+
+                    sim_i2t_targets = self.alpha * F.softmax(
+                        sim_i2t_m, dim=1) + (1 - self.alpha) * sim_targets
+                    sim_t2i_targets = self.alpha * F.softmax(
+                        sim_t2i_m, dim=1) + (1 - self.alpha) * sim_targets
+
+            sim_i2t = image_feat @ text_feat_all / self.temp
+            sim_t2i = text_feat @ image_feat_all / self.temp
+
+            if self.distill:
+                loss_i2t = -torch.sum(
+                    F.log_softmax(sim_i2t, dim=1) * sim_i2t_targets,
+                    dim=1).mean()
+                loss_t2i = -torch.sum(
+                    F.log_softmax(sim_t2i, dim=1) * sim_t2i_targets,
+                    dim=1).mean()
+            else:
+                loss_i2t = -torch.sum(
+                    F.log_softmax(sim_i2t, dim=1) * sim_targets, dim=1).mean()
+                loss_t2i = -torch.sum(
+                    F.log_softmax(sim_t2i, dim=1) * sim_targets, dim=1).mean()
+
+            loss_ita = (loss_i2t + loss_t2i) / 2
+
+            self._dequeue_and_enqueue(image_feat_m, text_feat_m, idx)
+
+            # forward the positive image-text pair
+            _, output_pos = self.fusion_encoder(
+                encoder_embeds=text_embeds,
+                attention_mask=text.attention_mask,
+                encoder_hidden_states=image_embeds,
+                encoder_attention_mask=image_atts,
+                return_dict=False,
+            )
+            with torch.no_grad():
+                bs = image.size(0)
+                weights_i2t = F.softmax(sim_i2t[:, :bs], dim=1)
+                weights_t2i = F.softmax(sim_t2i[:, :bs], dim=1)
+
+                mask = torch.eq(idx, idx.T)
+                weights_i2t.masked_fill_(mask, 0)
+                weights_t2i.masked_fill_(mask, 0)
+
+            # select a negative image for each text
+            image_embeds_neg = []
+            for b in range(bs):
+                neg_idx = torch.multinomial(weights_t2i[b], 1).item()
+                image_embeds_neg.append(image_embeds[neg_idx])
+            image_embeds_neg = torch.stack(image_embeds_neg, dim=0)
+
+            # select a negative text for each image
+            text_embeds_neg = []
+            text_atts_neg = []
+            for b in range(bs):
+                neg_idx = torch.multinomial(weights_i2t[b], 1).item()
+                text_embeds_neg.append(text_embeds[neg_idx])
+                text_atts_neg.append(text.attention_mask[neg_idx])
+            text_embeds_neg = torch.stack(text_embeds_neg, dim=0)
+            text_atts_neg = torch.stack(text_atts_neg, dim=0)
+
+            text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0)
+            text_atts_all = torch.cat([text.attention_mask, text_atts_neg],
+                                      dim=0)
+
+            image_embeds_all = torch.cat([image_embeds_neg, image_embeds],
+                                         dim=0)
+            image_atts_all = torch.cat([image_atts, image_atts], dim=0)
+
+            _, output_neg = self.fusion_encoder(
+                encoder_embeds=text_embeds_all,
+                attention_mask=text_atts_all,
+                encoder_hidden_states=image_embeds_all,
+                encoder_attention_mask=image_atts_all,
+                return_dict=False,
+            )
+
+            vl_embeddings = torch.cat(
+                [output_pos[:, 0, :], output_neg[:, 0, :]], dim=0)
+            vl_output = self.itm_head(vl_embeddings)
+
+            ones_tmp = torch.ones(bs, dtype=torch.long)
+            zeros_tmp = torch.zeros(2 * bs, dtype=torch.long)
+            itm_labels = torch.cat([ones_tmp, zeros_tmp],
+                                   dim=0).to(image.device)
+            loss_itm = F.cross_entropy(vl_output, itm_labels)
+
+            return loss_ita + loss_itm
+        else:
+            text_output = self.text_encoder(
+                text.input_ids, attention_mask=text.attention_mask)
+            text_feat = text_output.last_hidden_state
+            image_feat = self.visual_encoder.visual(
+                image, skip_last_layer=True)
+            image_feat = self.visn_layer_norm(self.visn_fc(image_feat))
+            image_att = torch.ones(
+                image_feat.size()[:-1],
+                dtype=torch.long,
+                device=image_feat.device)
+            _, output = self.fusion_encoder(
+                encoder_embeds=text_feat,
+                attention_mask=text.attention_mask,
+                encoder_hidden_states=image_feat,
+                encoder_attention_mask=image_att,
+                return_dict=False,
+            )
+            scores = self.itm_head(output[:, 0, :])
+            scores = F.softmax(scores, dim=-1)
+
+            return scores
diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py
index fb460714..608cc733 100644
--- a/modelscope/models/multi_modal/mplug_for_all_tasks.py
+++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py
@@ -12,6 +12,7 @@ __all__ = ['MPlugForAllTasks']
 @MODELS.register_module(
     Tasks.visual_question_answering, module_name=Models.mplug)
 @MODELS.register_module(Tasks.image_captioning, module_name=Models.mplug)
+@MODELS.register_module(Tasks.image_text_retrieval, module_name=Models.mplug)
 class MPlugForAllTasks(TorchModel):
 
     def __init__(self, model_dir: str, *args, **kwargs):
@@ -43,39 +44,50 @@ class MPlugForAllTasks(TorchModel):
                                ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
                                ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
 
-        if not self.training and 'answer_input_ids' not in input:
-            topk_ids, _ = self.model(**input)
+        # inference
+        if not self.training and 'question' in input:
+            output = self.model(input['image'], input['question'], train=False)
+            if not isinstance(output, tuple):
+                return output
+            topk_ids, _ = output
             pred_string: str = self.tokenizer.decode(topk_ids[0][0])
             for _old, _new in replace_tokens_bert:
                 pred_string = pred_string.replace(_old, _new)
             pred_string = pred_string.strip()
             return pred_string
-        else:
-            import addict
+
+        # train and evaluate
+        import addict
+        image = input['image']
+        answer = addict.Dict(
+            input_ids=input['answer_input_ids'],
+            attention_mask=input['answer_attention_mask'])
+        if 'index' not in input:
             question = addict.Dict(
                 input_ids=input['question_input_ids'],
                 attention_mask=input['question_attention_mask'])
-            answer = addict.Dict(
-                input_ids=input['answer_input_ids'],
-                attention_mask=input['answer_attention_mask'])
-            output = self.model(
-                input['image'], question, answer, train=self.training)
-            if self.training:
-                return {'loss': output}
-            topk_ids, _ = output
-            preds: List[str] = [
-                self.tokenizer.decode(batch[0]) for batch in topk_ids
-            ]
-            for i in range(len(preds)):
-                for _old, _new in replace_tokens_bert:
-                    preds[i] = preds[i].replace(_old, _new)
-                preds[i] = preds[i].strip()
-            tgts: List[str] = [
-                self.tokenizer.decode(batch)
-                for batch in input['answer_input_ids'].cpu().numpy().tolist()
-            ]
-            for i in range(len(tgts)):
-                for _old, _new in replace_tokens_bert:
-                    tgts[i] = tgts[i].replace(_old, _new)
-                preds[i] = preds[i].strip()
-            return {'preds': preds, 'tgts': tgts}
+            output = self.model(image, question, answer, train=self.training)
+        else:
+            index = input['index']
+            output = self.model(image, answer, index, train=self.training)
+        if self.training:
+            return {'loss': output}
+
+        # evaluate
+        topk_ids, _ = output
+        preds: List[str] = [
+            self.tokenizer.decode(batch[0]) for batch in topk_ids
+        ]
+        for i in range(len(preds)):
+            for _old, _new in replace_tokens_bert:
+                preds[i] = preds[i].replace(_old, _new)
+            preds[i] = preds[i].strip()
+        tgts: List[str] = [
+            self.tokenizer.decode(batch)
+            for batch in input['answer_input_ids'].cpu().numpy().tolist()
+        ]
+        for i in range(len(tgts)):
+            for _old, _new in replace_tokens_bert:
+                tgts[i] = tgts[i].replace(_old, _new)
+            tgts[i] = tgts[i].strip()  # strip the target itself, not preds[i]
+        return {'preds': preds, 'tgts': tgts}
diff --git a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
new file mode 100644
index 00000000..1ebcf526
--- /dev/null
+++ b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
@@ -0,0 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import MPlugPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_text_retrieval, module_name=Pipelines.image_text_retrieval)
+class ImageTextRetrievalPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+        """
+        Use `model` and `preprocessor` to create an
+        image text retrieval pipeline for prediction.
+        Args:
+            model: model id on modelscope hub, or a Model instance.
+        """
+        super().__init__(model=model)
+        assert isinstance(model, str) or isinstance(model, Model), \
+            f'model must be a single str or Model, but got {type(model)}'
+        if isinstance(model, str):
+            pipe_model = Model.from_pretrained(model)
+        elif isinstance(model, Model):
+            pipe_model = model
+        else:
+            raise NotImplementedError
+        pipe_model.model.eval()
+        if preprocessor is None:
+            preprocessor = MPlugPreprocessor(pipe_model.model_dir)
+        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return super().forward(inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return {OutputKeys.SCORES: inputs[0].tolist()}
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 4f0cb977..9873a62c 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -1,6 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Tuple, Union
 
 import torch
 from PIL import Image
@@ -104,6 +104,7 @@ class MPlugPreprocessor(Preprocessor):
 
         self._tokenizer = None
         self._patch_resize_transform = None
+        self._image_map = {}
 
     @property
     def tokenizer(self):
@@ -133,31 +134,31 @@ class MPlugPreprocessor(Preprocessor):
             ])
         return self._patch_resize_transform
 
-    def __call__(self, *args, **kwargs):
-        call_mapping = {
-            Tasks.visual_question_answering: self.image_text_call,
-            Tasks.image_captioning: self.image_text_call,
-        }
+    def image_open(self, path: str) -> Tuple[Image.Image, int]:
+        if path not in self._image_map:
+            index = len(self._image_map)
+            self._image_map[path] = (Image.open(path), index)
+        return self._image_map[path]
 
-        self.cfg = Config.from_file(
-            osp.join(self.model_dir, ModelFile.CONFIGURATION))
-        return call_mapping[self.cfg.task](*args, **kwargs)
-
-    def image_text_call(
+    def __call__(
             self, data: Union[Image.Image, tuple,
                               Dict[str, Any]]) -> Dict[str, Any]:
+        self.cfg = Config.from_file(
+            osp.join(self.model_dir, ModelFile.CONFIGURATION))
+
         if isinstance(data, (Image.Image, str)):
             image = data
         elif isinstance(data, tuple):
             image = data[0]
         else:
             image = data['image']
+        index = 0
         if isinstance(image, str):
-            image = Image.open(image)
-        question = '' if self.cfg.task != Tasks.visual_question_answering \
-            else data[1 if isinstance(data, tuple) else 'question']
+            image, index = self.image_open(image)
         image = image.convert('RGB')
         image = self.patch_resize_transform(image)
+        question = '' if self.cfg.task == Tasks.image_captioning \
+            else data[1 if isinstance(data, tuple) else 'question']
         question = self.tokenizer(
             question.lower(),
             padding='max_length',
@@ -167,7 +168,7 @@ class MPlugPreprocessor(Preprocessor):
 
         if self.mode == ModeKeys.INFERENCE:
             image = torch.stack([image], dim=0)
-            return {'image': image, 'question': question, 'train': False}
+            return {'image': image, 'question': question}
         else:
             answer = data['answer']
             answer = self.tokenizer(
@@ -176,10 +177,13 @@ class MPlugPreprocessor(Preprocessor):
                 truncation=True,
                 max_length=self.tokenizer_max_length,
                 return_tensors='pt')
-            return {
+            output = {
                 'image': image,
                 'question_input_ids': question.input_ids.squeeze(),
                 'question_attention_mask': question.attention_mask.squeeze(),
                 'answer_input_ids': answer.input_ids.squeeze(),
                 'answer_attention_mask': answer.attention_mask.squeeze(),
             }
+            if self.cfg.task == Tasks.image_text_retrieval:
+                output['index'] = index
+            return output
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 6d419a7e..66f734f9 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -121,6 +121,7 @@ class MultiModalTasks(object):
     visual_question_answering = 'visual-question-answering'
     visual_entailment = 'visual-entailment'
     video_multi_modal_embedding = 'video-multi-modal-embedding'
+    image_text_retrieval = 'image-text-retrieval'
 
 
 class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks):
diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py
index 4b8a813a..642ac11d 100644
--- a/tests/pipelines/test_mplug_tasks.py
+++ b/tests/pipelines/test_mplug_tasks.py
@@ -54,6 +54,27 @@ class MplugTasksTest(unittest.TestCase):
         result = pipeline_vqa(input)
         print(result)
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_image_text_retrieval_with_model(self):
+        model = Model.from_pretrained(
+            'damo/mplug_image-text-retrieval_flickr30k_large_en')
+        pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
+        image = Image.open('data/test/images/image-text-retrieval.jpg')
+        question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
+        input = {'image': image, 'question': question}
+        result = pipeline_retrieval(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_image_text_retrieval_with_name(self):
+        model = 'damo/mplug_image-text-retrieval_flickr30k_large_en'
+        pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
+        image = Image.open('data/test/images/image-text-retrieval.jpg')
+        question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
+        input = {'image': image, 'question': question}
+        result = pipeline_retrieval(input)
+        print(result)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index 5776141c..1298f1cd 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -4,8 +4,6 @@ import shutil
 import tempfile
 import unittest
 
-from PIL import Image
-
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Trainers
 from modelscope.models.multi_modal import MPlugForAllTasks
@@ -23,7 +21,10 @@ class TestFinetuneMPlug(unittest.TestCase):
         if not os.path.exists(self.tmp_dir):
             os.makedirs(self.tmp_dir)
 
-        datadict = MsDataset.load('coco_captions_small_slice')
+        from modelscope.utils.constant import DownloadMode
+        datadict = MsDataset.load(
+            'coco_captions_small_slice',
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
         self.train_dataset = MsDataset(datadict['train'].to_hf_dataset().map(
             lambda _: {
                 'question': 'what the picture describes?'
@@ -35,17 +36,19 @@ class TestFinetuneMPlug(unittest.TestCase):
             }).rename_column('image:FILE',
                              'image').rename_column('answer:Value', 'answer'))
 
+        self.max_epochs = 3
+
     def tearDown(self):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer_with_caption(self):
-
         kwargs = dict(
             model='damo/mplug_image-captioning_coco_base_en',
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
             work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
@@ -53,15 +56,11 @@ class TestFinetuneMPlug(unittest.TestCase):
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(3):
+        for i in range(self.max_epochs):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_trainer_with_caption_with_model_and_args(self):
-        tmp_dir = tempfile.TemporaryDirectory().name
-        if not os.path.exists(tmp_dir):
-            os.makedirs(tmp_dir)
-
         cache_path = snapshot_download(
             'damo/mplug_image-captioning_coco_base_en')
         model = MPlugForAllTasks.from_pretrained(cache_path)
@@ -70,7 +69,7 @@ class TestFinetuneMPlug(unittest.TestCase):
             model=model,
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
-            max_epochs=2,
+            max_epochs=self.max_epochs,
             work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
@@ -78,16 +77,16 @@ class TestFinetuneMPlug(unittest.TestCase):
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(2):
+        for i in range(self.max_epochs):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer_with_vqa(self):
-
         kwargs = dict(
             model='damo/mplug_visual-question-answering_coco_large_en',
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
             work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
@@ -95,15 +94,11 @@ class TestFinetuneMPlug(unittest.TestCase):
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(3):
+        for i in range(self.max_epochs):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_trainer_with_vqa_with_model_and_args(self):
-        tmp_dir = tempfile.TemporaryDirectory().name
-        if not os.path.exists(tmp_dir):
-            os.makedirs(tmp_dir)
-
         cache_path = snapshot_download(
             'damo/mplug_visual-question-answering_coco_large_en')
         model = MPlugForAllTasks.from_pretrained(cache_path)
@@ -112,7 +107,7 @@ class TestFinetuneMPlug(unittest.TestCase):
             model=model,
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
-            max_epochs=2,
+            max_epochs=self.max_epochs,
             work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
@@ -120,7 +115,45 @@ class TestFinetuneMPlug(unittest.TestCase):
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(2):
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_retrieval(self):
+        kwargs = dict(
+            model='damo/mplug_image-text-retrieval_flickr30k_large_en',
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_retrieval_with_model_and_args(self):
+        cache_path = snapshot_download(
+            'damo/mplug_image-text-retrieval_flickr30k_large_en')
+        model = MPlugForAllTasks.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
 
From 88d0804dcd6cdc430ea96ede82796a48a92596c2 Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Tue, 30 Aug 2022 10:52:07 +0800
Subject: [PATCH 021/175] [to #42322933] Add S4: child-tuning

1. Add the child-tuning optimizer and its unit tests.
2. Fix a training bug that can cause an interruption after cross-evaluation.
3. Move model.params from cfg to default_args in build_optimizer to prevent the params from being saved in save_pretrained.
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9891963
---
 modelscope/trainers/optimizer/__init__.py     |   3 +-
 modelscope/trainers/optimizer/builder.py      |   5 +-
 .../optimizer/child_tuning_adamw_optimizer.py | 188 ++++++++++++++++++
 modelscope/trainers/trainer.py                |   1 +
 .../test_finetune_sequence_classification.py  | 132 +++++++++++-
 .../test_finetune_token_classificatin.py      |   7 +-
 6 files changed, 331 insertions(+), 5 deletions(-)
 create mode 100644 modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py

diff --git a/modelscope/trainers/optimizer/__init__.py b/modelscope/trainers/optimizer/__init__.py
index 884f3043..9962c2c2 100644
--- a/modelscope/trainers/optimizer/__init__.py
+++ b/modelscope/trainers/optimizer/__init__.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from .builder import OPTIMIZERS, build_optimizer
+from .child_tuning_adamw_optimizer import ChildTuningAdamW
 
-__all__ = ['OPTIMIZERS', 'build_optimizer']
+__all__ = ['OPTIMIZERS', 'build_optimizer', 'ChildTuningAdamW']
diff --git a/modelscope/trainers/optimizer/builder.py b/modelscope/trainers/optimizer/builder.py
index 4d772dd9..f43768d6 100644
--- a/modelscope/trainers/optimizer/builder.py
+++ b/modelscope/trainers/optimizer/builder.py
@@ -20,7 +20,10 @@ def build_optimizer(model: torch.nn.Module,
     """
     if hasattr(model, 'module'):
         model = model.module
-    cfg.params = model.parameters()
+
+    if default_args is None:
+        default_args = {}
+    default_args['params'] = model.parameters()
 
     return build_from_cfg(
         cfg, OPTIMIZERS, group_key=default_group, default_args=default_args)
diff --git a/modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py b/modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py
new file mode 100644
index 00000000..d004071f
--- /dev/null
+++ b/modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py
@@ -0,0 +1,188 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import types
+from typing import Callable, Iterable, Tuple
+
+import numpy as np
+import torch
+from torch.distributions.bernoulli import Bernoulli
+from torch.optim import Optimizer
+
+from modelscope.utils.logger import get_logger
+from .builder import OPTIMIZERS, default_group
+
+logger = get_logger(__name__)
+
+__all__ = ['calculate_fisher', 'ChildTuningAdamW']
+
+
+def calculate_fisher(model: torch.nn.Module,
+                     data_loader,
+                     forward_step,
+                     reserve_p,
+                     grad_clip=None):
+    """Build ChildTuning-D masks from averaged squared gradients.
+
+    ``forward_step(model, inputs)`` must return a loss to backpropagate.
+    Only parameters whose name contains 'layer' are scored. Returns a dict
+    mapping each such parameter to a boolean mask that is True where its
+    score reaches the ``(1 - reserve_p)`` percentile over all scored entries.
+    ``grad_clip`` holds optional kwargs for ``clip_grad_norm_``.
+    """
+    logger.info('Calculate Fisher Information...')
+    model.train()
+    scored = [p for n, p in model.named_parameters() if 'layer' in n]
+    if not scored:
+        raise ValueError("no parameter name contains 'layer'")
+    gradient_mask = {p: p.new_zeros(p.size()) for p in scored}
+
+    iters = len(data_loader)
+    for inputs in data_loader:
+        loss = forward_step(model, inputs)
+        loss.backward()
+        for p in scored:
+            if p.grad is None:  # frozen or unused in this step
+                continue
+            if grad_clip is not None:
+                torch.nn.utils.clip_grad_norm_(p, **grad_clip)
+            gradient_mask[p] += (p.grad**2) / iters
+        model.zero_grad()
+
+    # TODO: pytorch: torch.kthvalue
+    # Flatten once; repeated np.append would re-copy the array per tensor.
+    r = np.concatenate(
+        [v.view(-1).cpu().numpy() for v in gradient_mask.values()])
+    polar = np.percentile(r, (1 - reserve_p) * 100)
+    for k in gradient_mask:
+        gradient_mask[k] = gradient_mask[k] >= polar
+    logger.info('Polar => {}'.format(polar))
+    return gradient_mask
+
+
+@OPTIMIZERS.register_module(
+    group_key=default_group, module_name='ChildTuningAdamW')
+class ChildTuningAdamW(Optimizer):
+    """AdamW that masks gradients before each update (child-tuning).
+
+    mode=None uses the gradients unmasked. 'ChildTuning-D' multiplies them by
+    the mask passed to set_gradient_mask. Any other mode samples a fresh
+    Bernoulli(reserve_p) mask per step and divides the result by reserve_p.
+    """
+
+    def __init__(self,
+                 params: Iterable[torch.nn.parameter.Parameter],
+                 lr: float = 1e-3,
+                 betas: Tuple[float, float] = (0.9, 0.999),
+                 eps: float = 1e-6,
+                 weight_decay: float = 0.0,
+                 correct_bias: bool = True,
+                 reserve_p=1.0,
+                 mode=None):
+        if lr < 0.0:
+            raise ValueError(
+                'Invalid learning rate: {} - should be >= 0.0'.format(lr))
+        for beta in (betas[0], betas[1]):
+            if not 0.0 <= beta < 1.0:
+                raise ValueError('Invalid beta parameter: {} - should be in '
+                                 '[0.0, 1.0['.format(beta))
+        if not 0.0 <= eps:
+            raise ValueError(
+                'Invalid epsilon value: {} - should be >= 0.0'.format(eps))
+        if mode not in (None, 'ChildTuning-D') and not 0.0 < reserve_p <= 1.0:
+            raise ValueError(
+                'reserve_p should be in (0, 1], got {}'.format(reserve_p))
+        defaults = dict(
+            lr=lr,
+            betas=betas,
+            eps=eps,
+            weight_decay=weight_decay,
+            correct_bias=correct_bias)
+        super().__init__(params, defaults)
+
+        self.gradient_mask = None
+        self.reserve_p = reserve_p
+        self.mode = mode
+
+    def set_gradient_mask(self, gradient_mask):
+        """Store the parameter -> mask mapping used by 'ChildTuning-D'."""
+        self.gradient_mask = gradient_mask
+
+    def step(self, closure: Callable = None):
+        """
+        Performs a single optimization step.
+        Arguments:
+            closure (:obj:`Callable`, `optional`): A closure that reevaluates the model and returns the loss.
+        """
+        if self.mode == 'ChildTuning-D' and self.gradient_mask is None:
+            raise RuntimeError(
+                'ChildTuning-D requires set_gradient_mask() before step()')
+        loss = None
+        if closure is not None:
+            loss = closure()
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError(
+                        'Adam does not support sparse gradients, please consider SparseAdam instead'
+                    )
+
+                # ChildTuning code (masks p.grad in place)
+                if self.mode is not None:
+                    if self.mode == 'ChildTuning-D':
+                        if p in self.gradient_mask:
+                            grad *= self.gradient_mask[p]
+                    else:
+                        # ChildTuning-F
+                        grad_mask = Bernoulli(
+                            grad.new_full(
+                                size=grad.size(), fill_value=self.reserve_p))
+                        grad *= grad_mask.sample() / self.reserve_p
+
+                state = self.state[p]
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p.data)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+                state['step'] += 1
+
+                # Update both moment estimates in place.
+                exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
+                denom = exp_avg_sq.sqrt().add_(group['eps'])
+
+                step_size = group['lr']
+                if group['correct_bias']:  # No bias correction for Bert
+                    bias_correction1 = 1.0 - beta1**state['step']
+                    bias_correction2 = 1.0 - beta2**state['step']
+                    step_size = step_size * math.sqrt(
+                        bias_correction2) / bias_correction1
+
+                p.data.addcdiv_(exp_avg, denom, value=-step_size)
+
+                # Decoupled weight decay: applied directly to the weights after
+                # the Adam step so it does not interact with the m/v moments.
+                p.data.add_(p.data, alpha=-group['lr'] * group['weight_decay'])
+
+        return loss
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index dc8c5c09..290478cb 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -800,6 +800,7 @@ class EpochBasedTrainer(BaseTrainer):
                 self.invoke_hook(TrainerStages.after_train_iter)
                 del self.data_batch
                 self._iter += 1
+                self._mode = ModeKeys.TRAIN
 
                 if i + 1 >= self.iters_per_epoch:
                     break
diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py
index 847e47ef..24f1a2fd 100644
--- a/tests/trainers/test_finetune_sequence_classification.py
+++ b/tests/trainers/test_finetune_sequence_classification.py
@@ -6,9 +6,15 @@ import unittest
 
 from modelscope.metainfo import Preprocessors, Trainers
 from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
 from modelscope.trainers import build_trainer
+from modelscope.trainers.hooks import Hook
+from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer
+from modelscope.trainers.optimizer.child_tuning_adamw_optimizer import \
+    calculate_fisher
 from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.data_utils import to_device
 
 
 class TestFinetuneSequenceClassification(unittest.TestCase):
@@ -69,6 +75,10 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
 
     @unittest.skip
     def test_finetune_afqmc(self):
+        """This unittest is used to reproduce the clue:afqmc dataset + structbert model training results.
+
+        User can train a custom dataset by modifying this piece of code and comment the @unittest.skip.
+        """
 
         def cfg_modify_fn(cfg):
             cfg.task = Tasks.sentence_similarity
@@ -114,7 +124,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         dc.local_files_only = True
         dataset = load_dataset('clue', 'afqmc', download_config=dc)
         self.finetune(
-            model_id='damo/nlp_structbert_backbone_tiny_std',
+            model_id='damo/nlp_structbert_backbone_base_std',
             train_dataset=dataset['train'],
             eval_dataset=dataset['validation'],
             cfg_modify_fn=cfg_modify_fn)
@@ -124,6 +134,10 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
 
     @unittest.skip
     def test_finetune_tnews(self):
+        """This unittest is used to reproduce the clue:tnews dataset + structbert model training results.
+
+        User can train a custom dataset by modifying this piece of code and comment the @unittest.skip.
+        """
 
         def cfg_modify_fn(cfg):
             # TODO no proper task for tnews
@@ -175,13 +189,21 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         dataset = load_dataset('clue', 'tnews', download_config=dc)
 
         self.finetune(
-            model_id='damo/nlp_structbert_backbone_tiny_std',
+            model_id='damo/nlp_structbert_backbone_base_std',
             train_dataset=dataset['train'],
             eval_dataset=dataset['validation'],
             cfg_modify_fn=cfg_modify_fn)
 
     @unittest.skip
     def test_veco_xnli(self):
+        """This unittest is used to reproduce the xnli dataset + veco model training results.
+
+        Here we follow the training scenario listed in the Alicemind open source project:
+        https://github.com/alibaba/AliceMind/tree/main/VECO
+        by training the english language subset.
+        User can train a custom dataset by modifying this piece of code and comment the @unittest.skip.
+        """
+
         from datasets import load_dataset
         langs = ['en']
         langs_eval = ['en']
@@ -267,6 +289,112 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             name=Trainers.nlp_veco_trainer,
             cfg_modify_fn=cfg_modify_fn)
 
+    @unittest.skip
+    def test_finetune_cluewsc(self):
+        """Reproduce the clue:wsc dataset + structbert model training results.
+
+        A runnable sample of child-tuning is also shown here: set child_tuning_type to
+        'ChildTuning-D' to register the hook that computes the gradient mask via calculate_fisher.
+        Users can train a custom dataset by modifying this code and commenting out @unittest.skip.
+        """
+
+        child_tuning_type = 'ChildTuning-F'
+        mode = {}
+        if child_tuning_type is not None:
+            mode = {'mode': child_tuning_type, 'reserve_p': 0.2}
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'nli'
+            cfg['preprocessor'] = {'type': 'nli-tokenizer'}
+            cfg['dataset'] = {
+                'train': {
+                    'labels': ['0', '1'],
+                    'first_sequence': 'text',
+                    'second_sequence': 'text2',
+                    'label': 'label',
+                }
+            }
+            cfg.train.dataloader.batch_size_per_gpu = 16
+            cfg.train.max_epochs = 30
+            cfg.train.optimizer = {
+                'type':
+                'AdamW' if child_tuning_type is None else 'ChildTuningAdamW',
+                'lr': 1e-5,
+                'options': {},
+                **mode,
+            }
+            cfg.train.lr_scheduler = {
+                'type':
+                'LinearLR',
+                'start_factor':
+                1.0,
+                'end_factor':
+                0.0,
+                'total_iters':
+                int(
+                    len(dataset['train'])
+                    / cfg.train.dataloader.batch_size_per_gpu)
+                * cfg.train.max_epochs,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 30
+            }]
+            return cfg
+
+        def add_sentence2(features):
+            return {
+                'text2':
+                features['target']['span2_text'] + '指代'
+                + features['target']['span1_text']
+            }
+
+        dataset = MsDataset.load('clue', subset_name='cluewsc2020')
+        dataset = {
+            k: v.to_hf_dataset().map(add_sentence2)
+            for k, v in dataset.items()
+        }
+
+        kwargs = dict(
+            model='damo/nlp_structbert_backbone_base_std',
+            train_dataset=dataset['train'],
+            eval_dataset=dataset['validation'],
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn)
+
+        os.environ['LOCAL_RANK'] = '0'  # NOTE(review): never restored
+        trainer: NlpEpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+
+        class CalculateFisherHook(Hook):
+
+            @staticmethod
+            def forward_step(model, inputs):
+                inputs = to_device(inputs, trainer.device)
+                trainer.train_step(model, inputs)
+                return trainer.train_outputs['loss']
+
+            def before_run(self, trainer: NlpEpochBasedTrainer):
+                v = calculate_fisher(trainer.model, trainer.train_dataloader,
+                                     self.forward_step, 0.2)
+                trainer.optimizer.set_gradient_mask(v)
+
+        if child_tuning_type == 'ChildTuning-D':
+            trainer.register_hook(CalculateFisherHook())
+        trainer.train()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py
index 520d1a3c..c34410be 100644
--- a/tests/trainers/test_finetune_token_classificatin.py
+++ b/tests/trainers/test_finetune_token_classificatin.py
@@ -47,6 +47,11 @@ class TestFinetuneTokenClassification(unittest.TestCase):
 
     @unittest.skip
     def test_word_segmentation(self):
+        """This unittest is used to reproduce the icwb2:pku dataset + structbert model training results.
+
+        User can train a custom dataset by modifying this piece of code and comment the @unittest.skip.
+        """
+
         os.system(
             f'curl http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip > {self.tmp_dir}/icwb2-data.zip'
         )
@@ -114,7 +119,7 @@ class TestFinetuneTokenClassification(unittest.TestCase):
             return cfg
 
         self.finetune(
-            'damo/nlp_structbert_backbone_tiny_std',
+            'damo/nlp_structbert_backbone_base_std',
             train_dataset,
             dev_dataset,
             cfg_modify_fn=cfg_modify_fn)

From 745bd5a9e00b0981a52dfc244f2ebc33e11c94cd Mon Sep 17 00:00:00 2001
From: "shichen.fsc" <shichen.fsc@alibaba-inc.com>
Date: Tue, 30 Aug 2022 14:28:25 +0800
Subject: [PATCH 022/175] [to #42322933] remove some unittest about asr        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9941890

---
 .../test_automatic_speech_recognition.py      | 62 -------------------
 1 file changed, 62 deletions(-)

diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py
index 88ebcdbd..a83f5031 100644
--- a/tests/pipelines/test_automatic_speech_recognition.py
+++ b/tests/pipelines/test_automatic_speech_recognition.py
@@ -53,14 +53,6 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
             'checking_item': OutputKeys.TEXT,
             'example': 'dataset_example'
         },
-        'test_run_with_ark_dataset': {
-            'checking_item': OutputKeys.TEXT,
-            'example': 'dataset_example'
-        },
-        'test_run_with_tfrecord_dataset': {
-            'checking_item': OutputKeys.TEXT,
-            'example': 'dataset_example'
-        },
         'dataset_example': {
             'Wrd': 49532,  # the number of words
             'Snt': 5000,  # the number of sentences
@@ -252,60 +244,6 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
             model_id=self.am_tf_model_id, audio_in=dataset_path)
         self.check_result('test_run_with_wav_dataset_tf', rec_result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_ark_dataset(self):
-        '''run with datasets, and audio format is kaldi_ark
-           datasets directory:
-             <dataset_path>
-               test   # testsets
-                 data.ark
-                 data.scp
-                 data.text
-               dev    # devsets
-                 data.ark
-                 data.scp
-                 data.text
-               train  # trainsets
-                 data.ark
-                 data.scp
-                 data.text
-        '''
-
-        logger.info('Run ASR test with ark dataset (pytorch)...')
-        logger.info('Downloading ark testsets file ...')
-
-        dataset_path = download_and_untar(
-            os.path.join(self.workspace, AISHELL1_TESTSETS_FILE),
-            AISHELL1_TESTSETS_URL, self.workspace)
-        dataset_path = os.path.join(dataset_path, 'test')
-
-        rec_result = self.run_pipeline(
-            model_id=self.am_pytorch_model_id, audio_in=dataset_path)
-        self.check_result('test_run_with_ark_dataset', rec_result)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_tfrecord_dataset(self):
-        '''run with datasets, and audio format is tfrecord
-           datasets directory:
-             <dataset_path>
-               test   # testsets
-                 data.records
-                 data.idx
-                 data.text
-        '''
-
-        logger.info('Run ASR test with tfrecord dataset (tensorflow)...')
-        logger.info('Downloading tfrecord testsets file ...')
-
-        dataset_path = download_and_untar(
-            os.path.join(self.workspace, TFRECORD_TESTSETS_FILE),
-            TFRECORD_TESTSETS_URL, self.workspace)
-        dataset_path = os.path.join(dataset_path, 'test')
-
-        rec_result = self.run_pipeline(
-            model_id=self.am_tf_model_id, audio_in=dataset_path)
-        self.check_result('test_run_with_tfrecord_dataset', rec_result)
-
 
 if __name__ == '__main__':
     unittest.main()

From 2b64cf2bb6f22f55f99abf9b700fa05a4844cdbf Mon Sep 17 00:00:00 2001
From: "feiwu.yfw" <feiwu.yfw@alibaba-inc.com>
Date: Tue, 30 Aug 2022 15:15:15 +0800
Subject: [PATCH 023/175] =?UTF-8?q?[to=20#42322933]=E6=94=AF=E6=8C=81?=
 =?UTF-8?q?=E4=BB=8Edataset=20json=E6=96=87=E4=BB=B6=E4=B8=AD=E8=8E=B7?=
 =?UTF-8?q?=E5=8F=96=E5=8F=82=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* support reading args from the dataset json file
---
 modelscope/msdatasets/ms_dataset.py           |  6 ++--
 ...mage_instance_segmentation_coco_dataset.py |  8 ++---
 modelscope/msdatasets/utils/dataset_utils.py  | 16 ++++++----
 tests/msdatasets/test_ms_dataset.py           |  5 ++--
 tests/trainers/test_finetune_mplug.py         |  1 -
 ...est_image_instance_segmentation_trainer.py | 30 ++++++-------------
 6 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 454044a4..b5527734 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -248,15 +248,15 @@ class MsDataset:
                 break
         target_subset_name, target_dataset_structure = get_target_dataset_structure(
             dataset_json, subset_name, split)
-        meta_map, file_map = get_dataset_files(target_dataset_structure,
-                                               dataset_name, namespace,
-                                               version)
+        meta_map, file_map, args_map = get_dataset_files(
+            target_dataset_structure, dataset_name, namespace, version)
         builder = load_dataset_builder(
             dataset_name,
             subset_name,
             namespace,
             meta_data_files=meta_map,
             zip_data_files=file_map,
+            args_map=args_map,
             cache_dir=MS_DATASETS_CACHE,
             version=version,
             split=list(target_dataset_structure.keys()),
diff --git a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
index a001fe36..10cf7bfb 100644
--- a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
+++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
@@ -60,6 +60,8 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
                  classes=None,
                  seg_prefix=None,
                  folder_name=None,
+                 ann_file=None,
+                 img_prefix=None,
                  test_mode=False,
                  filter_empty_gt=True,
                  **kwargs):
@@ -69,11 +71,9 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
         self.split = next(iter(split_config.keys()))
         self.preprocessor = preprocessor
 
-        self.ann_file = osp.join(self.data_root,
-                                 DATASET_STRUCTURE[self.split]['annotation'])
+        self.ann_file = osp.join(self.data_root, ann_file)
 
-        self.img_prefix = osp.join(self.data_root,
-                                   DATASET_STRUCTURE[self.split]['images'])
+        self.img_prefix = osp.join(self.data_root, img_prefix)
         self.seg_prefix = seg_prefix
         self.test_mode = test_mode
         self.filter_empty_gt = filter_empty_gt
diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py
index 08a6de84..769bed93 100644
--- a/modelscope/msdatasets/utils/dataset_utils.py
+++ b/modelscope/msdatasets/utils/dataset_utils.py
@@ -1,6 +1,6 @@
 import os
 from collections import defaultdict
-from typing import Mapping, Optional, Sequence, Union
+from typing import Any, Mapping, Optional, Sequence, Union
 
 from datasets.builder import DatasetBuilder
 
@@ -92,6 +92,7 @@ def get_dataset_files(subset_split_into: dict,
     """
     meta_map = defaultdict(dict)
     file_map = defaultdict(dict)
+    args_map = defaultdict(dict)
     from modelscope.hub.api import HubApi
     modelscope_api = HubApi()
     for split, info in subset_split_into.items():
@@ -99,7 +100,8 @@ def get_dataset_files(subset_split_into: dict,
             info.get('meta', ''), dataset_name, namespace, revision)
         if info.get('file'):
             file_map[split] = info['file']
-    return meta_map, file_map
+        args_map[split] = info.get('args')
+    return meta_map, file_map, args_map
 
 
 def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
@@ -107,12 +109,16 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
                                                              Sequence[str]]],
                          zip_data_files: Mapping[str, Union[str,
                                                             Sequence[str]]],
-                         cache_dir: str, version: Optional[Union[str]],
-                         split: Sequence[str],
+                         args_map: Mapping[str, Any], cache_dir: str,
+                         version: Optional[Union[str]], split: Sequence[str],
                          **config_kwargs) -> DatasetBuilder:
     sub_dir = os.path.join(version, '_'.join(split))
     meta_data_file = next(iter(meta_data_files.values()))
     if not meta_data_file:
+        args_map = next(iter(args_map.values()))
+        if args_map is None:
+            args_map = {}
+        args_map.update(config_kwargs)
         builder_instance = TaskSpecificDatasetBuilder(
             dataset_name=dataset_name,
             namespace=namespace,
@@ -121,7 +127,7 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
             meta_data_files=meta_data_files,
             zip_data_files=zip_data_files,
             hash=sub_dir,
-            **config_kwargs)
+            **args_map)
     elif meta_data_file.endswith('.csv'):
         builder_instance = MsCsvDatasetBuilder(
             dataset_name=dataset_name,
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index 1d62d2d1..ed07def7 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -36,9 +36,8 @@ class MsDatasetTest(unittest.TestCase):
         ms_ds_train = MsDataset.load(
             'pets_small',
             namespace=DEFAULT_DATASET_NAMESPACE,
-            split='train',
-            classes=('1', '2'),
-            folder_name='Pets')
+            download_mode=DownloadMode.FORCE_REDOWNLOAD,
+            split='train')
         print(ms_ds_train.config_kwargs)
         assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))
 
diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index 1298f1cd..351600c6 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -20,7 +20,6 @@ class TestFinetuneMPlug(unittest.TestCase):
         self.tmp_dir = tempfile.TemporaryDirectory().name
         if not os.path.exists(self.tmp_dir):
             os.makedirs(self.tmp_dir)
-
         from modelscope.utils.constant import DownloadMode
         datadict = MsDataset.load(
             'coco_captions_small_slice',
diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py
index 774f8fa8..03f7eea3 100644
--- a/tests/trainers/test_image_instance_segmentation_trainer.py
+++ b/tests/trainers/test_image_instance_segmentation_trainer.py
@@ -15,7 +15,7 @@ from modelscope.msdatasets.task_datasets import \
     ImageInstanceSegmentationCocoDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.config import Config, ConfigDict
-from modelscope.utils.constant import ModelFile
+from modelscope.utils.constant import DownloadMode, ModelFile
 from modelscope.utils.test_utils import test_level
 
 
@@ -41,38 +41,26 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
         if train_data_cfg is None:
             # use default toy data
             train_data_cfg = ConfigDict(
-                name='pets_small',
-                split='train',
-                classes=('Cat', 'Dog'),
-                folder_name='Pets',
-                test_mode=False)
+                name='pets_small', split='train', test_mode=False)
         if val_data_cfg is None:
             val_data_cfg = ConfigDict(
-                name='pets_small',
-                split='validation',
-                classes=('Cat', 'Dog'),
-                folder_name='Pets',
-                test_mode=True)
+                name='pets_small', split='validation', test_mode=True)
 
         self.train_dataset = MsDataset.load(
             dataset_name=train_data_cfg.name,
             split=train_data_cfg.split,
-            classes=train_data_cfg.classes,
-            folder_name=train_data_cfg.folder_name,
-            test_mode=train_data_cfg.test_mode)
-        assert self.train_dataset.config_kwargs[
-            'classes'] == train_data_cfg.classes
+            test_mode=train_data_cfg.test_mode,
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        assert self.train_dataset.config_kwargs['classes']
         assert next(
             iter(self.train_dataset.config_kwargs['split_config'].values()))
 
         self.eval_dataset = MsDataset.load(
             dataset_name=val_data_cfg.name,
             split=val_data_cfg.split,
-            classes=val_data_cfg.classes,
-            folder_name=val_data_cfg.folder_name,
-            test_mode=val_data_cfg.test_mode)
-        assert self.eval_dataset.config_kwargs[
-            'classes'] == val_data_cfg.classes
+            test_mode=val_data_cfg.test_mode,
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        assert self.eval_dataset.config_kwargs['classes']
         assert next(
             iter(self.eval_dataset.config_kwargs['split_config'].values()))
 

From 2b380f0410f8b84969a1ed64fe376ab808aaeb6e Mon Sep 17 00:00:00 2001
From: "shichen.fsc" <shichen.fsc@alibaba-inc.com>
Date: Tue, 30 Aug 2022 15:30:00 +0800
Subject: [PATCH 024/175] [to #42322933] add new Task - document segmentation  
       Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9942858

    * [Add] add document-segmentation
---
 modelscope/metainfo.py                        |   3 +
 modelscope/models/nlp/__init__.py             |   2 +
 .../nlp/bert_for_document_segmentation.py     | 108 +++++++++
 modelscope/pipelines/nlp/__init__.py          |   2 +
 .../nlp/document_segmentation_pipeline.py     | 175 ++++++++++++++
 modelscope/preprocessors/__init__.py          |   2 +
 modelscope/preprocessors/slp.py               | 223 ++++++++++++++++++
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_document_segmentation.py |  56 +++++
 9 files changed, 572 insertions(+)
 create mode 100644 modelscope/models/nlp/bert_for_document_segmentation.py
 create mode 100644 modelscope/pipelines/nlp/document_segmentation_pipeline.py
 create mode 100644 modelscope/preprocessors/slp.py
 create mode 100644 tests/pipelines/test_document_segmentation.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index b4d005a7..908ee011 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -45,6 +45,7 @@ class Models(object):
     tcrf = 'transformer-crf'
     bart = 'bart'
     gpt3 = 'gpt3'
+    bert_for_ds = 'bert-for-document-segmentation'
 
     # audio models
     sambert_hifigan = 'sambert-hifigan'
@@ -151,6 +152,7 @@ class Pipelines(object):
     text_error_correction = 'text-error-correction'
     faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    document_segmentation = 'document-segmentation'
 
     # audio tasks
     sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -240,6 +242,7 @@ class Preprocessors(object):
     fill_mask = 'fill-mask'
     faq_question_answering_preprocessor = 'faq-question-answering-preprocessor'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    document_segmentation = 'document-segmentation'
 
     # audio preprocessor
     linear_aec_fbank = 'linear-aec-fbank'
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 13be9096..8bf06c1d 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -7,6 +7,7 @@ if TYPE_CHECKING:
     from .backbones import SbertModel
     from .heads import SequenceClassificationHead
     from .bert_for_sequence_classification import BertForSequenceClassification
+    from .bert_for_document_segmentation import BertForDocumentSegmentation
     from .csanmt_for_translation import CsanmtForTranslation
     from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM,
                                   BertForMaskedLM)
@@ -30,6 +31,7 @@ else:
         'heads': ['SequenceClassificationHead'],
         'csanmt_for_translation': ['CsanmtForTranslation'],
         'bert_for_sequence_classification': ['BertForSequenceClassification'],
+        'bert_for_document_segmentation': ['BertForDocumentSegmentation'],
         'masked_language':
         ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'],
         'nncrf_for_named_entity_recognition':
diff --git a/modelscope/models/nlp/bert_for_document_segmentation.py b/modelscope/models/nlp/bert_for_document_segmentation.py
new file mode 100644
index 00000000..dfa57597
--- /dev/null
+++ b/modelscope/models/nlp/bert_for_document_segmentation.py
@@ -0,0 +1,108 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.modeling_outputs import TokenClassifierOutput
+from transformers.models.bert.modeling_bert import (BertModel,
+                                                    BertPreTrainedModel)
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Model
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import Tasks
+
+__all__ = ['BertForDocumentSegmentation']
+
+
+@MODELS.register_module(
+    Tasks.document_segmentation, module_name=Models.bert_for_ds)
+class BertForDocumentSegmentation(Model):
+    """Registry entry that loads BertForDocumentSegmentationBase weights."""
+    def __init__(self, model_dir: str, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+
+    def build_with_config(self, config):
+        self.bert_model = BertForDocumentSegmentationBase.from_pretrained(
+            self.model_dir, from_tf=False, config=config)
+        return self.bert_model  # the loaded net is also kept on self
+
+    def forward(self, input: Dict[str, Dict]) -> Dict[str, Any]:
+        pass  # placeholder: does nothing and returns None
+
+
+class BertForDocumentSegmentationBase(BertPreTrainedModel):
+    """BERT encoder plus dropout and a linear layer that scores tokens."""
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.sentence_pooler_type = None
+        self.bert = BertModel(config, add_pooling_layer=False)
+        # Classification head: dropout, then a linear layer to num_labels.
+        classifier_dropout = config.hidden_dropout_prob
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.class_weights = None
+        self.init_weights()
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                sentence_attention_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None):
+        """Return logits and, when `labels` is given, a cross-entropy loss."""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        if self.sentence_pooler_type is not None:
+            raise NotImplementedError
+        else:
+            sequence_output = self.dropout(sequence_output)
+
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss(weight=self.class_weights)
+            if sentence_attention_mask is not None:
+                active_loss = sentence_attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)
+                # Labels where the mask != 1 become ignore_index.
+                active_labels = labels.view(-1).masked_fill(
+                    ~active_loss, loss_fct.ignore_index)
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 51803872..2dd5bf62 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -8,6 +8,7 @@ if TYPE_CHECKING:
     from .dialog_intent_prediction_pipeline import DialogIntentPredictionPipeline
     from .dialog_modeling_pipeline import DialogModelingPipeline
     from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline
+    from .document_segmentation_pipeline import DocumentSegmentationPipeline
     from .fill_mask_pipeline import FillMaskPipeline
     from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline
     from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline
@@ -30,6 +31,7 @@ else:
         ['DialogIntentPredictionPipeline'],
         'dialog_modeling_pipeline': ['DialogModelingPipeline'],
         'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'],
+        'document_segmentation_pipeline': ['DocumentSegmentationPipeline'],
         'fill_mask_pipeline': ['FillMaskPipeline'],
         'single_sentence_classification_pipeline':
         ['SingleSentenceClassificationPipeline'],
diff --git a/modelscope/pipelines/nlp/document_segmentation_pipeline.py b/modelscope/pipelines/nlp/document_segmentation_pipeline.py
new file mode 100644
index 00000000..00837bf3
--- /dev/null
+++ b/modelscope/pipelines/nlp/document_segmentation_pipeline.py
@@ -0,0 +1,175 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import re
+from typing import Any, Dict, List, Union
+
+import numpy as np
+import torch
+from datasets import Dataset
+from transformers.models.bert.modeling_bert import BertConfig
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import DocumentSegmentationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['DocumentSegmentationPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.document_segmentation, module_name=Pipelines.document_segmentation)
+class DocumentSegmentationPipeline(Pipeline):
+    """Cut documents into sentences and break lines after 'B-EOP' ones."""
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: DocumentSegmentationPreprocessor = None,
+                 **kwargs):
+        # `model` may be a Model instance or an argument for from_pretrained.
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+
+        self.model_dir = model.model_dir
+        config = BertConfig.from_pretrained(model.model_dir, num_labels=2)
+        # The network called by predict() is built from this BERT config.
+        self.document_segmentation_model = model.build_with_config(
+            config=config)
+
+        if preprocessor is None:
+            preprocessor = DocumentSegmentationPreprocessor(
+                self.model_dir, config)
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+        self.preprocessor = preprocessor
+
+    def __call__(self, documents: Union[List[str], str]) -> Dict[str, Any]:
+        output = self.predict(documents)  # a list with one dict per document
+        output = self.postprocess(output)
+        return output
+
+    def predict(self, documents: Union[List[str], str]) -> Dict[str, Any]:
+        pred_samples = self.cut_documents(documents)
+        predict_examples = Dataset.from_dict(pred_samples)
+
+        # Build model features; one document may yield several samples.
+        predict_dataset = self.preprocessor(predict_examples)
+        num_examples = len(
+            predict_examples[self.preprocessor.context_column_name])
+        num_samples = len(
+            predict_dataset[self.preprocessor.context_column_name])
+        # Pop the columns that are not passed to the model's forward().
+        predict_dataset.pop('segment_ids')
+        labels = predict_dataset.pop('labels')
+        sentences = predict_dataset.pop('sentences')
+        example_ids = predict_dataset.pop(
+            self.preprocessor.example_id_column_name)
+
+        with torch.no_grad():
+            input = {
+                key: torch.tensor(val)
+                for key, val in predict_dataset.items()
+            }
+            predictions = self.document_segmentation_model.forward(
+                **input).logits
+        # argmax over axis 2 (presumably the label dimension - TODO confirm)
+        predictions = np.argmax(predictions, axis=2)
+        assert len(sentences) == len(
+            predictions), 'sample {}  infer_sample {} prediction {}'.format(
+                num_samples, len(sentences), len(predictions))
+        # Remove ignored index (special tokens)
+        true_predictions = [
+            [
+                self.preprocessor.label_list[p]
+                for (p, l) in zip(prediction, label) if l != -100  # noqa *
+            ] for prediction, label in zip(predictions, labels)
+        ]
+
+        true_labels = [
+            [
+                self.preprocessor.label_list[l]
+                for (p, l) in zip(prediction, label) if l != -100  # noqa *
+            ] for prediction, label in zip(predictions, labels)
+        ]
+
+        # Save predictions
+        out = []
+        for i in range(num_examples):
+            out.append({'sentences': [], 'labels': [], 'predictions': []})
+        # Merge the samples of each document back together via example_id.
+        for prediction, sentence_list, label, example_id in zip(
+                true_predictions, sentences, true_labels, example_ids):
+            if len(label) < len(sentence_list):  # pad a missing label
+                label.append('B-EOP')
+                prediction.append('B-EOP')
+            assert len(sentence_list) == len(prediction), '{} {}'.format(
+                len(sentence_list), len(prediction))
+            assert len(sentence_list) == len(label), '{} {}'.format(
+                len(sentence_list), len(label))
+            out[example_id]['sentences'].extend(sentence_list)
+            out[example_id]['labels'].extend(label)
+            out[example_id]['predictions'].extend(prediction)
+
+        return out
+
+    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """Join sentences into text, breaking after each 'B-EOP' prediction.
+
+        Args:
+            inputs: the list returned by predict(), one dict per document.
+
+        Returns:
+            {OutputKeys.TEXT: str} for one document, else a list of str.
+        """
+        result = []
+        list_count = len(inputs)
+        for num in range(list_count):
+            res = []
+            for s, p in zip(inputs[num]['sentences'],
+                            inputs[num]['predictions']):
+                s = s.strip()
+                if p == 'B-EOP':
+                    s = ''.join([s, '\n\t'])
+                res.append(s)
+
+            document = ('\t' + ''.join(res))
+            result.append(document)
+
+        if list_count == 1:
+            return {OutputKeys.TEXT: result[0]}
+        else:
+            return {OutputKeys.TEXT: result}
+
+    def cut_documents(self, para: Union[List[str], str]):
+        document_list = para
+        if isinstance(para, str):  # a single document becomes a 1-item list
+            document_list = [para]
+        sentences = []
+        labels = []
+        example_id = []
+        id = 0
+        for document in document_list:
+            sentence = self.cut_sentence(document)
+            label = ['O'] * (len(sentence) - 1) + ['B-EOP']  # dummy labels
+            sentences.append(sentence)
+            labels.append(label)
+            example_id.append(id)
+            id += 1
+
+        return {
+            'example_id': example_id,
+            'sentences': sentences,
+            'labels': labels
+        }
+
+    def cut_sentence(self, para):  # regex-based split into sentences
+        para = re.sub(r'([。！.!？\?])([^”’])', r'\1\n\2', para)  # noqa *
+        para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)  # noqa *
+        para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)  # noqa *
+        para = re.sub(r'([。！？\?][”’])([^，。！？\?])', r'\1\n\2', para)  # noqa *
+        para = para.rstrip()
+        return [_ for _ in para.split('\n') if _]
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index ce9df454..f5ac0e4e 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -23,6 +23,7 @@ if TYPE_CHECKING:
                       FillMaskPreprocessor, ZeroShotClassificationPreprocessor,
                       NERPreprocessor, TextErrorCorrectionPreprocessor,
                       FaqQuestionAnsweringPreprocessor)
+    from .slp import DocumentSegmentationPreprocessor
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
                         DialogStateTrackingPreprocessor)
@@ -52,6 +53,7 @@ else:
             'TextErrorCorrectionPreprocessor',
             'FaqQuestionAnsweringPreprocessor'
         ],
+        'slp': ['DocumentSegmentationPreprocessor'],
         'space': [
             'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor',
             'DialogStateTrackingPreprocessor', 'InputFeatures'
diff --git a/modelscope/preprocessors/slp.py b/modelscope/preprocessors/slp.py
new file mode 100644
index 00000000..d9c2d9b7
--- /dev/null
+++ b/modelscope/preprocessors/slp.py
@@ -0,0 +1,223 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+from transformers import BertTokenizerFast
+
+from modelscope.metainfo import Preprocessors
+from modelscope.utils.constant import Fields
+from modelscope.utils.hub import get_model_type, parse_label_mapping
+from modelscope.utils.type_assert import type_assert
+from .base import Preprocessor
+from .builder import PREPROCESSORS
+
+__all__ = ['DocumentSegmentationPreprocessor']
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.document_segmentation)
+class DocumentSegmentationPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, config, *args, **kwargs):
+        """Set up the tokenizer and the label vocabulary.
+
+        Args:
+            model_dir (str): directory the BertTokenizerFast is loaded from.
+            config: supplies max_position_embeddings (max sequence length).
+        """
+
+        super().__init__(*args, **kwargs)
+
+        self.tokenizer = BertTokenizerFast.from_pretrained(
+            model_dir,
+            use_fast=True,
+        )
+        self.question_column_name = 'labels'
+        self.context_column_name = 'sentences'
+        self.example_id_column_name = 'example_id'
+        self.label_to_id = {'B-EOP': 0, 'O': 1}
+        self.target_specical_ids = set()
+        self.target_specical_ids.add(self.tokenizer.eos_token_id)
+        self.max_seq_length = config.max_position_embeddings
+        self.label_list = ['B-EOP', 'O']
+
+    def __call__(self, examples) -> Dict[str, Any]:
+        """Tokenize documents and cut them into model-sized samples.
+
+        Args:
+            examples: columns 'sentences', 'labels' and 'example_id'.
+
+        Returns:
+            Dict of per-sample lists: input_ids, token_type_ids,
+            attention_mask, segment_ids, example_id, labels, sentences;
+            an empty dict if the tokenizer raises.
+        """
+        questions = examples[self.question_column_name]
+        contexts = examples[self.context_column_name]
+        example_ids = examples[self.example_id_column_name]
+        num_examples = len(questions)
+        # Append the '[EOS]' marker to every sentence.
+        sentences = []
+        for sentence_list in contexts:
+            sentence_list = [_ + '[EOS]' for _ in sentence_list]
+            sentences.append(sentence_list)
+
+        try:
+            tokenized_examples = self.tokenizer(
+                sentences,
+                is_split_into_words=True,
+                add_special_tokens=False,
+                return_token_type_ids=True,
+                return_attention_mask=True,
+            )
+        except Exception as e:
+            print(str(e))
+            return {}
+        # Only eos tokens carry their sentence's label; the rest get -100.
+        segment_ids = []
+        token_seq_labels = []
+        for example_index in range(num_examples):
+            example_input_ids = tokenized_examples['input_ids'][example_index]
+            example_labels = questions[example_index]
+            example_labels = [
+                self.label_to_id[_] if _ in self.label_to_id else -100
+                for _ in example_labels
+            ]
+            example_token_labels = []
+            segment_id = []
+            cur_seg_id = 1
+            for token_index in range(len(example_input_ids)):
+                if example_input_ids[token_index] in self.target_specical_ids:
+                    example_token_labels.append(example_labels[cur_seg_id - 1])
+                    segment_id.append(cur_seg_id)
+                    cur_seg_id += 1
+                else:
+                    example_token_labels.append(-100)
+                    segment_id.append(cur_seg_id)
+
+            segment_ids.append(segment_id)
+            token_seq_labels.append(example_token_labels)
+
+        tokenized_examples['segment_ids'] = segment_ids
+        tokenized_examples['token_seq_labels'] = token_seq_labels
+
+        new_segment_ids = []
+        new_token_seq_labels = []
+        new_input_ids = []
+        new_token_type_ids = []
+        new_attention_mask = []
+        new_example_ids = []
+        new_sentences = []
+
+        for example_index in range(num_examples):
+            example_input_ids = tokenized_examples['input_ids'][example_index]
+            example_token_type_ids = tokenized_examples['token_type_ids'][
+                example_index]
+            example_attention_mask = tokenized_examples['attention_mask'][
+                example_index]
+            example_segment_ids = tokenized_examples['segment_ids'][
+                example_index]
+            example_token_seq_labels = tokenized_examples['token_seq_labels'][
+                example_index]
+            example_sentences = contexts[example_index]
+            example_id = example_ids[example_index]
+            example_total_num_sentences = len(questions[example_index])
+            example_total_num_tokens = len(
+                tokenized_examples['input_ids'][example_index])
+            accumulate_length = [
+                i for i, x in enumerate(tokenized_examples['input_ids']
+                                        [example_index])
+                if x == self.tokenizer.eos_token_id
+            ]
+            samples_boundary = []
+            left_index = 0
+            sent_left_index = 0
+            sent_i = 0
+
+            # while (not for): sent_i only advances in some branches below.
+            while sent_i < len(accumulate_length):
+                length = accumulate_length[sent_i]
+                right_index = length + 1
+                sent_right_index = sent_i + 1
+                if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens:
+                    samples_boundary.append([left_index, right_index])
+
+                    sample_input_ids = [
+                        self.tokenizer.cls_token_id
+                    ] + example_input_ids[left_index:right_index]
+                    sample_input_ids = sample_input_ids[:self.max_seq_length]
+
+                    sample_token_type_ids = [
+                        0
+                    ] + example_token_type_ids[left_index:right_index]
+                    sample_token_type_ids = sample_token_type_ids[:self.
+                                                                  max_seq_length]
+
+                    sample_attention_mask = [
+                        1
+                    ] + example_attention_mask[left_index:right_index]
+                    sample_attention_mask = sample_attention_mask[:self.
+                                                                  max_seq_length]
+
+                    sample_segment_ids = [
+                        0
+                    ] + example_segment_ids[left_index:right_index]
+                    sample_segment_ids = sample_segment_ids[:self.
+                                                            max_seq_length]
+
+                    sample_token_seq_labels = [
+                        -100
+                    ] + example_token_seq_labels[left_index:right_index]
+                    sample_token_seq_labels = sample_token_seq_labels[:self.
+                                                                      max_seq_length]
+                    # Lone sentence: advance; else restart at this sentence.
+                    if sent_right_index - 1 == sent_left_index:
+                        left_index = right_index
+                        sample_input_ids[-1] = self.tokenizer.eos_token_id
+                        sample_token_seq_labels[-1] = -100
+                    else:
+                        left_index = accumulate_length[sent_i - 1] + 1
+                        if sample_token_seq_labels[-1] != -100:
+                            sample_token_seq_labels[-1] = -100
+
+                    if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens:
+                        sample_sentences = example_sentences[
+                            sent_left_index:sent_right_index]
+                        sent_left_index = sent_right_index
+                        sent_i += 1
+                    else:
+                        sample_sentences = example_sentences[
+                            sent_left_index:sent_right_index - 1]
+                        sent_left_index = sent_right_index - 1
+                    # Right-pad all fields to max_seq_length.
+                    while len(sample_input_ids) < self.max_seq_length:
+                        sample_input_ids.append(self.tokenizer.pad_token_id)
+                        sample_token_type_ids.append(0)
+                        sample_attention_mask.append(0)
+                        sample_segment_ids.append(example_total_num_sentences
+                                                  + 1)
+                        sample_token_seq_labels.append(-100)
+
+                    new_input_ids.append(sample_input_ids)
+                    new_token_type_ids.append(sample_token_type_ids)
+                    new_attention_mask.append(sample_attention_mask)
+                    new_segment_ids.append(sample_segment_ids)
+                    new_token_seq_labels.append(sample_token_seq_labels)
+                    new_example_ids.append(example_id)
+                    new_sentences.append(sample_sentences)
+                else:
+                    sent_i += 1
+                    continue
+
+        output_samples = {}
+
+        output_samples['input_ids'] = new_input_ids
+        output_samples['token_type_ids'] = new_token_type_ids
+        output_samples['attention_mask'] = new_attention_mask
+
+        output_samples['segment_ids'] = new_segment_ids
+        output_samples['example_id'] = new_example_ids
+        output_samples['labels'] = new_token_seq_labels
+        output_samples['sentences'] = new_sentences
+
+        return output_samples
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 66f734f9..dae7117e 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -98,6 +98,7 @@ class NLPTasks(object):
     text_error_correction = 'text-error-correction'
     faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    document_segmentation = 'document-segmentation'
 
 
 class AudioTasks(object):
diff --git a/tests/pipelines/test_document_segmentation.py b/tests/pipelines/test_document_segmentation.py
new file mode 100644
index 00000000..39609be8
--- /dev/null
+++ b/tests/pipelines/test_document_segmentation.py
@@ -0,0 +1,56 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+from typing import Any, Dict
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class DocumentSegmentationTest(unittest.TestCase):
+    """Runs the document segmentation pipeline and prints its text output."""
+    model_id = 'damo/nlp_bert_document-segmentation_chinese-base'
+    eng_model_id = 'damo/nlp_bert_document-segmentation_english-base'
+    sentences = '近年来，随着端到端语音识别的流行，基于Transformer结构的语音识别系统逐渐成为了主流。然而，由于Transformer是一种自回归模型，需要逐个生成目标文字，计算复杂度随着目标文字数量线性增加，限制了其在工业生产中的应用。针对Transoformer模型自回归生成文字的低计算效率缺陷，学术界提出了非自回归模型来并行的输出目标文字。根据生成目标文字时，迭代轮数，非自回归模型分为：多轮迭代式与单轮迭代非自回归模型。其中实用的是基于单轮迭代的非自回归模型。对于单轮非自回归模型，现有工作往往聚焦于如何更加准确的预测目标文字个数，如CTC-enhanced采用CTC预测输出文字个数，尽管如此，考虑到现实应用中，语速、口音、静音以及噪声等因素的影响，如何准确的预测目标文字个数以及抽取目标文字对应的声学隐变量仍然是一个比较大的挑战；另外一方面，我们通过对比自回归模型与单轮非自回归模型在工业大数据上的错误类型（如下图所示，AR与vanilla NAR），发现，相比于自回归模型，非自回归模型，在预测目标文字个数方面差距较小，但是替换错误显著的增加，我们认为这是由于单轮非自回归模型中条件独立假设导致的语义信息丢失。于此同时，目前非自回归模型主要停留在学术验证阶段，还没有工业大数据上的相关实验与结论。'  # noqa *
+    sentences_1 = '移动端语音唤醒模型，检测关键词为“小云小云”。模型主体为4层FSMN结构，使用CTC训练准则，参数量750K，适用于移动端设备运行。模型输入为Fbank特征，输出为基于char建模的中文全集token预测，测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。模型训练采用“basetrain + finetune”的模式，basetrain过程使用大量内部移动端数据，在此基础上，使用1万条设备端录制安静场景“小云小云”数据进行微调，得到最终面向业务的模型。后续用户可在basetrain模型基础上，使用其他关键词数据进行微调，得到新的语音唤醒模型，但暂时未开放模型finetune功能。'  # noqa *
+    eng_sentences = 'The Saint Alexander Nevsky Church was established in 1936 by Archbishop Vitaly (Maximenko) () on a tract of land donated by Yulia Martinovna Plavskaya.The initial chapel, dedicated to the memory of the great prince St. Alexander Nevsky (1220–1263), was blessed in May, 1936.The church building was subsequently expanded three times.In 1987, ground was cleared for the construction of the new church and on September 12, 1989, on the Feast Day of St. Alexander Nevsky, the cornerstone was laid and the relics of St. Herman of Alaska placed in the foundation.The imposing edifice, completed in 1997, is the work of Nikolaus Karsanov, architect and Protopresbyter Valery Lukianov, engineer.Funds were raised through donations.The Great blessing of the cathedral took place on October 18, 1997 with seven bishops, headed by Metropolitan Vitaly Ustinov, and 36 priests and deacons officiating, some 800 faithful attended the festivity.The old church was rededicated to Our Lady of Tikhvin.Metropolitan Hilarion (Kapral) announced, that cathedral will officially become the episcopal See of the Ruling Bishop of the Eastern American Diocese and the administrative center of the Diocese on September 12, 2014.At present the parish serves the spiritual needs of 300 members.The parochial school instructs over 90 boys and girls in religion, Russian language and history.The school meets every Saturday.The choir is directed by Andrew Burbelo.The sisterhood attends to the needs of the church and a church council acts in the administration of the community.The cathedral is decorated by frescoes in the Byzantine style.The iconography project was fulfilled by Father Andrew Erastov and his students from 1995 until 2001.'  # noqa *
+
+    def run_pipeline(self, model_id: str, documents: str) -> Dict[str, Any]:
+        p = pipeline(task=Tasks.document_segmentation, model=model_id)
+
+        result = p(documents=documents)
+
+        return result
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_document(self):
+        logger.info('Run document segmentation with one document ...')
+        # One document per call: model_id first, then eng_model_id.
+        result = self.run_pipeline(
+            model_id=self.model_id, documents=self.sentences)
+        print(result[OutputKeys.TEXT])
+
+        result = self.run_pipeline(
+            model_id=self.eng_model_id, documents=self.eng_sentences)
+        print(result[OutputKeys.TEXT])
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_documents(self):
+        logger.info('Run document segmentation with many documents ...')
+        # Two documents in one call; the text output is iterated below.
+        result = self.run_pipeline(
+            model_id=self.model_id,
+            documents=[self.sentences, self.sentences_1])
+
+        documents_list = result[OutputKeys.TEXT]
+        for document in documents_list:
+            print(document)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 12698b31a078fb32762b6c75a54eb48bff1cf671 Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Tue, 30 Aug 2022 17:59:15 +0800
Subject: [PATCH 025/175] [to #44340132] fix: ci case run out of gpu memory

---
 .dev_scripts/ci_container_test.sh             |   9 +-
 .dev_scripts/dockerci.sh                      |   3 +-
 modelscope/msdatasets/ms_dataset.py           |  83 ++---
 modelscope/pipelines/cv/ocr_utils/ops.py      |   6 +
 modelscope/trainers/trainer.py                |   3 +-
 modelscope/utils/device.py                    |  11 +-
 tests/isolated_cases.txt                      |   6 +
 tests/pipelines/test_multi_modal_embedding.py |   7 +-
 tests/run.py                                  | 284 +++++++++++++++++-
 .../test_image_color_enhance_trainer.py       |  76 +++--
 ...test_image_portrait_enhancement_trainer.py |  88 +++---
 tests/trainers/test_trainer.py                |   7 +-
 12 files changed, 433 insertions(+), 150 deletions(-)
 create mode 100644 tests/isolated_cases.txt

diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index 98e9f88d..2f18aff7 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -19,4 +19,11 @@ fi
 # test with install
 python setup.py install
 
-python tests/run.py
+if [ $# -eq 0 ]; then
+    ci_command="python tests/run.py --subprocess"
+else
+    ci_command="$@"
+fi
+echo "Running case with command: $ci_command"
+$ci_command
+#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py
diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh
index 95dd0e1a..dbb79514 100644
--- a/.dev_scripts/dockerci.sh
+++ b/.dev_scripts/dockerci.sh
@@ -7,7 +7,8 @@ gpus='7 6 5 4 3 2 1 0'
 cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
 cpu_sets_arr=($cpu_sets)
 is_get_file_lock=false
-CI_COMMAND=${CI_COMMAND:-'bash .dev_scripts/ci_container_test.sh'}
+CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND}
+echo "ci command: $CI_COMMAND"
 for gpu in $gpus
 do
   exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index b5527734..338c6333 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -1,9 +1,11 @@
+import math
 import os
 from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
                     Sequence, Union)
 
 import json
 import numpy as np
+import torch
 from datasets import Dataset, DatasetDict
 from datasets import load_dataset as hf_load_dataset
 from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
@@ -40,6 +42,46 @@ def format_list(para) -> List:
     return para
 
 
+class MsIterableDataset(torch.utils.data.IterableDataset):
+    """Torch iterable view of a dataset, sharded across DataLoader workers."""
+
+    def __init__(self, dataset: Iterable, preprocessor_list, retained_columns,
+                 columns):
+        super().__init__()
+        self.dataset = dataset
+        self.preprocessor_list = preprocessor_list
+        self.retained_columns = retained_columns
+        self.columns = columns
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __iter__(self):
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:  # single-process data loading
+            iter_start = 0
+            iter_end = len(self.dataset)
+        else:  # in a worker process: take this worker's contiguous slice
+            per_worker = math.ceil(
+                len(self.dataset) / float(worker_info.num_workers))
+            iter_start = worker_info.id * per_worker
+            iter_end = min(iter_start + per_worker, len(self.dataset))
+
+        for idx in range(iter_start, iter_end):
+            item_dict = self.dataset[idx]
+            res = {
+                k: np.array(item_dict[k])
+                for k in self.columns if k in self.retained_columns
+            }
+            for preprocessor in self.preprocessor_list:
+                res.update({
+                    k: np.array(v)
+                    for k, v in preprocessor(item_dict).items()
+                    if k in self.retained_columns
+                })
+            yield res
+
+
 class MsDataset:
     """
     ModelScope Dataset (aka, MsDataset) is backed by a huggingface Dataset to
@@ -318,45 +360,8 @@ class MsDataset:
                 continue
             retained_columns.append(k)
 
-        import math
-        import torch
-
-        class MsIterableDataset(torch.utils.data.IterableDataset):
-
-            def __init__(self, dataset: Iterable):
-                super(MsIterableDataset).__init__()
-                self.dataset = dataset
-
-            def __len__(self):
-                return len(self.dataset)
-
-            def __iter__(self):
-                worker_info = torch.utils.data.get_worker_info()
-                if worker_info is None:  # single-process data loading
-                    iter_start = 0
-                    iter_end = len(self.dataset)
-                else:  # in a worker process
-                    per_worker = math.ceil(
-                        len(self.dataset) / float(worker_info.num_workers))
-                    worker_id = worker_info.id
-                    iter_start = worker_id * per_worker
-                    iter_end = min(iter_start + per_worker, len(self.dataset))
-
-                for idx in range(iter_start, iter_end):
-                    item_dict = self.dataset[idx]
-                    res = {
-                        k: np.array(item_dict[k])
-                        for k in columns if k in retained_columns
-                    }
-                    for preprocessor in preprocessor_list:
-                        res.update({
-                            k: np.array(v)
-                            for k, v in preprocessor(item_dict).items()
-                            if k in retained_columns
-                        })
-                    yield res
-
-        return MsIterableDataset(self._hf_ds)
+        return MsIterableDataset(self._hf_ds, preprocessor_list,
+                                 retained_columns, columns)
 
     def to_torch_dataset(
         self,
diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py
index eeab36a0..09807b10 100644
--- a/modelscope/pipelines/cv/ocr_utils/ops.py
+++ b/modelscope/pipelines/cv/ocr_utils/ops.py
@@ -1,8 +1,10 @@
 import math
 import os
 import shutil
+import sys
 import uuid
 
+import absl.flags as absl_flags
 import cv2
 import numpy as np
 import tensorflow as tf
@@ -12,6 +14,10 @@ from . import utils
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
 
+# Pre-parse sys.argv with known_only=True so that flag parsing in tf does not
+# raise absl.flags._exceptions.UnrecognizedFlagError ("Unknown command line
+# flag 'OCRDetectionPipeline'") for arguments that are not defined flags.
+absl_flags.FLAGS(sys.argv, known_only=True)
 FLAGS = tf.app.flags.FLAGS
 tf.app.flags.DEFINE_string('weight_init_method', 'xavier',
                            'Weight initialization method')
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index 290478cb..614b728a 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -312,7 +312,8 @@ class EpochBasedTrainer(BaseTrainer):
                     else ConfigDict(type=None, mode=mode)
                 return datasets.to_torch_dataset(
                     task_data_config=cfg,
-                    task_name=self.cfg.task,
+                    task_name=self.cfg.task
+                    if hasattr(self.cfg, ConfigFields.task) else None,
                     preprocessors=preprocessor)
             elif isinstance(datasets, List) and isinstance(
                     datasets[0], MsDataset):
diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py
index aa8fda66..77e23122 100644
--- a/modelscope/utils/device.py
+++ b/modelscope/utils/device.py
@@ -8,12 +8,6 @@ from modelscope.utils.logger import get_logger
 
 logger = get_logger()
 
-if is_tf_available():
-    import tensorflow as tf
-
-if is_torch_available():
-    import torch
-
 
 def verify_device(device_name):
     """ Verify device is valid, device should be either cpu, cuda, gpu, cuda:X or gpu:X.
@@ -63,6 +57,7 @@ def device_placement(framework, device_name='gpu:0'):
     device_type, device_id = verify_device(device_name)
 
     if framework == Frameworks.tf:
+        import tensorflow as tf
         if device_type == Devices.gpu and not tf.test.is_gpu_available():
             logger.warning(
                 'tensorflow cuda is not available, using cpu instead.')
@@ -76,6 +71,7 @@ def device_placement(framework, device_name='gpu:0'):
                     yield
 
     elif framework == Frameworks.torch:
+        import torch
         if device_type == Devices.gpu:
             if torch.cuda.is_available():
                 torch.cuda.set_device(f'cuda:{device_id}')
@@ -86,12 +82,13 @@ def device_placement(framework, device_name='gpu:0'):
         yield
 
 
-def create_device(device_name) -> torch.DeviceObjType:
+def create_device(device_name):
     """ create torch device
 
     Args:
         device_name (str):  cpu, gpu, gpu:0, cuda:0 etc.
     """
+    import torch
     device_type, device_id = verify_device(device_name)
     use_cuda = False
     if device_type == Devices.gpu:
diff --git a/tests/isolated_cases.txt b/tests/isolated_cases.txt
new file mode 100644
index 00000000..be85142a
--- /dev/null
+++ b/tests/isolated_cases.txt
@@ -0,0 +1,6 @@
+ test_text_to_speech.py
+ test_multi_modal_embedding.py
+ test_ofa_tasks.py
+ test_video_summarization.py
+ test_dialog_modeling.py
+ test_csanmt_translation.py
diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py
index 6152f279..f94e31fa 100644
--- a/tests/pipelines/test_multi_modal_embedding.py
+++ b/tests/pipelines/test_multi_modal_embedding.py
@@ -31,11 +31,10 @@ class MultiModalEmbeddingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
+        model = Model.from_pretrained(
+            self.model_id, revision=self.model_version)
         pipeline_multi_modal_embedding = pipeline(
-            task=Tasks.multi_modal_embedding,
-            model=model,
-            model_revision=self.model_version)
+            task=Tasks.multi_modal_embedding, model=model)
         text_embedding = pipeline_multi_modal_embedding(
             self.test_input)[OutputKeys.TEXT_EMBEDDING]
         print('l1-norm: {}'.format(
diff --git a/tests/run.py b/tests/run.py
index 27af7fe5..1a601eda 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -2,11 +2,20 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import argparse
+import datetime
+import multiprocessing
 import os
+import subprocess
 import sys
+import tempfile
 import unittest
 from fnmatch import fnmatch
+from multiprocessing.managers import BaseManager
+from pathlib import Path
+from turtle import shape
+from unittest import TestResult, TextTestResult
 
+import pandas
 # NOTICE: Tensorflow 1.15 seems not so compatible with pytorch.
 #         A segmentation fault may be raise by pytorch cpp library
 #         if 'import tensorflow' in front of 'import torch'.
@@ -19,6 +28,227 @@ from modelscope.utils.test_utils import set_test_level, test_level
 logger = get_logger()
 
 
+def test_cases_result_to_df(result_list):
+    table_header = [
+        'Name', 'Result', 'Info', 'Start time', 'Stop time',
+        'Time cost(seconds)'
+    ]
+    df = pandas.DataFrame(
+        result_list, columns=table_header).sort_values(
+            by=['Start time'], ascending=True)
+    return df
+
+
+def statistics_test_result(df):
+    """Print the one-line run summary; exit with status 1 if it FAILED."""
+    total_cases = df.shape[0]
+    # yapf: disable
+    success_cases = df.loc[df['Result'] == 'Success'].shape[0]
+    error_cases = df.loc[df['Result'] == 'Error'].shape[0]
+    failures_cases = df.loc[df['Result'] == 'Failures'].shape[0]
+    expected_failure_cases = df.loc[df['Result'] == 'ExpectedFailures'].shape[0]
+    unexpected_success_cases = df.loc[df['Result'] == 'UnexpectedSuccesses'].shape[0]
+    skipped_cases = df.loc[df['Result'] == 'Skipped'].shape[0]
+    # yapf: enable
+
+    # Failures, errors and unexpected successes each fail the whole run.
+    if failures_cases or error_cases or unexpected_success_cases:
+        result = 'FAILED'
+    else:
+        result = 'SUCCESS'
+    result_msg = '%s (Runs=%s,success=%s,failures=%s,errors=%s,' \
+        'skipped=%s,expected failures=%s,unexpected successes=%s)' % (
+            result, total_cases, success_cases, failures_cases, error_cases,
+            skipped_cases, expected_failure_cases, unexpected_success_cases)
+
+    print(result_msg)
+    if result == 'FAILED':
+        sys.exit(1)
+
+
+def gather_test_suites_in_files(test_dir, case_file_list, list_tests):
+    test_suite = unittest.TestSuite()
+    for case in case_file_list:
+        test_case = unittest.defaultTestLoader.discover(
+            start_dir=test_dir, pattern=case)
+        test_suite.addTest(test_case)
+        if hasattr(test_case, '__iter__'):
+            for subcase in test_case:
+                if list_tests:
+                    print(subcase)
+        else:
+            if list_tests:
+                print(test_case)
+    return test_suite
+
+
+def gather_test_suites_files(test_dir, pattern):
+    case_file_list = []
+    for dirpath, dirnames, filenames in os.walk(test_dir):
+        for file in filenames:
+            if fnmatch(file, pattern):
+                case_file_list.append(file)
+    return case_file_list
+
+
+def collect_test_results(case_results):
+    result_list = [
+    ]  # each item: Name, Result, Info, Start time, Stop time, Time cost
+    for case_result in case_results.successes:  # bare test objects
+        result_list.append(
+            (case_result.test_full_name, 'Success', '', case_result.start_time,
+             case_result.stop_time, case_result.time_cost))
+    for case_result in case_results.errors:  # (test, info) pairs
+        result_list.append(
+            (case_result[0].test_full_name, 'Error', case_result[1],
+             case_result[0].start_time, case_result[0].stop_time,
+             case_result[0].time_cost))
+    for case_result in case_results.skipped:  # (test, info) pairs
+        result_list.append(
+            (case_result[0].test_full_name, 'Skipped', case_result[1],
+             case_result[0].start_time, case_result[0].stop_time,
+             case_result[0].time_cost))
+    for case_result in case_results.expectedFailures:  # (test, info) pairs
+        result_list.append(
+            (case_result[0].test_full_name, 'ExpectedFailures', case_result[1],
+             case_result[0].start_time, case_result[0].stop_time,
+             case_result[0].time_cost))
+    for case_result in case_results.failures:  # (test, info) pairs
+        result_list.append(
+            (case_result[0].test_full_name, 'Failures', case_result[1],
+             case_result[0].start_time, case_result[0].stop_time,
+             case_result[0].time_cost))
+    for case_result in case_results.unexpectedSuccesses:  # bare test objects
+        result_list.append((case_result.test_full_name, 'UnexpectedSuccesses',
+                            '', case_result.start_time, case_result.stop_time,
+                            case_result.time_cost))
+    return result_list
+
+
+class TestSuiteRunner:
+
+    def run(self, msg_queue, test_dir, test_suite_file):
+        test_suite = unittest.TestSuite()
+        test_case = unittest.defaultTestLoader.discover(
+            start_dir=test_dir, pattern=test_suite_file)
+        test_suite.addTest(test_case)
+        runner = TimeCostTextTestRunner()
+        test_suite_result = runner.run(test_suite)
+        msg_queue.put(collect_test_results(test_suite_result))
+
+
+def run_command_with_popen(cmd):
+    with subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            bufsize=1,
+            encoding='utf8') as sub_process:
+        for line in iter(sub_process.stdout.readline, ''):
+            sys.stdout.write(line)
+
+
+def run_in_subprocess(args):
+    """Run isolated cases one subprocess each; the rest in this process."""
+    test_suite_files = gather_test_suites_files(
+        os.path.abspath(args.test_dir), args.pattern)
+
+    if args.subprocess:  # run all case in subprocess
+        isolated_cases = test_suite_files
+    else:
+        with open(args.isolated_cases, 'r') as f:
+            names = (line.strip() for line in f)
+            isolated_cases = [n for n in names if n in test_suite_files]
+
+    if not args.list_tests:
+        with tempfile.TemporaryDirectory() as temp_result_dir:
+            for test_suite_file in isolated_cases:  # run case in subprocess
+                # Same interpreter and test level as the parent process.
+                cmd = [
+                    sys.executable, 'tests/run.py', '--level',
+                    str(args.level), '--pattern', test_suite_file,
+                    '--result_dir', temp_result_dir
+                ]
+                run_command_with_popen(cmd)
+            result_dfs = []
+            # run remain cases in a process.
+            remain_suite_files = [
+                item for item in test_suite_files if item not in isolated_cases
+            ]
+            test_suite = gather_test_suites_in_files(args.test_dir,
+                                                     remain_suite_files,
+                                                     args.list_tests)
+            if test_suite.countTestCases() > 0:
+                runner = TimeCostTextTestRunner()
+                result = runner.run(test_suite)
+                result = collect_test_results(result)
+                df = test_cases_result_to_df(result)
+                result_dfs.append(df)
+
+            # collect test results
+            result_path = Path(temp_result_dir)
+            for result in result_path.iterdir():
+                if Path.is_file(result):
+                    df = pandas.read_pickle(result)
+                    result_dfs.append(df)
+
+            result_pd = pandas.concat(
+                result_dfs)  # merge result of every test suite.
+            print_table_result(result_pd)
+            print_abnormal_case_info(result_pd)
+            statistics_test_result(result_pd)
+
+
+def get_object_full_name(obj):
+    klass = obj.__class__
+    module = klass.__module__
+    if module == 'builtins':
+        return klass.__qualname__
+    return module + '.' + klass.__qualname__
+
+
+class TimeCostTextTestResult(TextTestResult):
+    """Record test case time used!"""
+
+    def __init__(self, stream, descriptions, verbosity):
+        self.successes = []  # passed tests; the base class keeps no such list
+        super(TimeCostTextTestResult,
+              self).__init__(stream, descriptions, verbosity)
+
+    def startTest(self, test):
+        test.start_time = datetime.datetime.now()
+        test.test_full_name = get_object_full_name(
+            test) + '.' + test._testMethodName
+        self.stream.writeln('Test case:  %s start at: %s' %
+                            (test.test_full_name, test.start_time))
+
+        return super(TimeCostTextTestResult, self).startTest(test)
+
+    def stopTest(self, test):
+        # Stamp the timing onto the test object, then delegate exactly once.
+        test.stop_time = datetime.datetime.now()
+        test.time_cost = (test.stop_time - test.start_time).total_seconds()
+        self.stream.writeln(
+            'Test case: %s stop at: %s, cost time: %s(seconds)' %
+            (test.test_full_name, test.stop_time, test.time_cost))
+        super(TimeCostTextTestResult, self).stopTest(test)
+
+    def addSuccess(self, test):
+        self.successes.append(test)
+        super(TextTestResult, self).addSuccess(test)
+
+
+class TimeCostTextTestRunner(unittest.runner.TextTestRunner):
+    resultclass = TimeCostTextTestResult
+
+    def run(self, test):
+        return super(TimeCostTextTestRunner, self).run(test)
+
+    def _makeResult(self):
+        result = super(TimeCostTextTestRunner, self)._makeResult()
+        return result
+
+
 def gather_test_cases(test_dir, pattern, list_tests):
     case_list = []
     for dirpath, dirnames, filenames in os.walk(test_dir):
@@ -42,16 +272,40 @@ def gather_test_cases(test_dir, pattern, list_tests):
     return test_suite
 
 
+def print_abnormal_case_info(df):
+    df = df.loc[(df['Result'] == 'Error') | (df['Result'] == 'Failures')]
+    for _, row in df.iterrows():
+        print('Case %s run result: %s, msg:\n%s' %
+              (row['Name'], row['Result'], row['Info']))
+
+
+def print_table_result(df):
+    df = df.loc[df['Result'] != 'Skipped']
+    df = df.drop('Info', axis=1)
+    formatters = {
+        'Name': '{{:<{}s}}'.format(df['Name'].str.len().max()).format,
+        'Result': '{{:<{}s}}'.format(df['Result'].str.len().max()).format,
+    }
+    with pandas.option_context('display.max_rows', None, 'display.max_columns',
+                               None, 'display.width', None):
+        print(df.to_string(justify='left', formatters=formatters, index=False))
+
+
 def main(args):
-    runner = unittest.TextTestRunner()
+    runner = TimeCostTextTestRunner()
     test_suite = gather_test_cases(
         os.path.abspath(args.test_dir), args.pattern, args.list_tests)
     if not args.list_tests:
         result = runner.run(test_suite)
-        if len(result.failures) > 0:
-            sys.exit(len(result.failures))
-        if len(result.errors) > 0:
-            sys.exit(len(result.errors))
+        result = collect_test_results(result)
+        df = test_cases_result_to_df(result)
+        if args.result_dir is not None:
+            file_name = str(int(datetime.datetime.now().timestamp() * 1000))
+            df.to_pickle(os.path.join(args.result_dir, file_name))
+        else:
+            print_table_result(df)
+            print_abnormal_case_info(df)
+            statistics_test_result(df)
 
 
 if __name__ == '__main__':
@@ -66,6 +320,18 @@ if __name__ == '__main__':
         '--level', default=0, type=int, help='2 -- all, 1 -- p1, 0 -- p0')
     parser.add_argument(
         '--disable_profile', action='store_true', help='disable profiling')
+    parser.add_argument(
+        '--isolated_cases',
+        default=None,
+        help='specified isolated cases config file')
+    parser.add_argument(
+        '--subprocess',
+        action='store_true',
+        help='run all test suite in subprocess')
+    parser.add_argument(
+        '--result_dir',
+        default=None,
+        help='Save result to directory, internal use only')
     args = parser.parse_args()
     set_test_level(args.level)
     logger.info(f'TEST LEVEL: {test_level()}')
@@ -73,4 +339,10 @@ if __name__ == '__main__':
         from utils import profiler
         logger.info('enable profile ...')
         profiler.enable()
-    main(args)
+    if args.isolated_cases is not None and args.subprocess:
+        print('isolated_cases and subprocess conflict')
+        sys.exit(1)  # checked first: the 'or' branch below would shadow it
+    elif args.isolated_cases is not None or args.subprocess:
+        run_in_subprocess(args)
+    else:
+        main(args)
diff --git a/tests/trainers/test_image_color_enhance_trainer.py b/tests/trainers/test_image_color_enhance_trainer.py
index f1dcbe51..34d84cd2 100644
--- a/tests/trainers/test_image_color_enhance_trainer.py
+++ b/tests/trainers/test_image_color_enhance_trainer.py
@@ -17,6 +17,41 @@ from modelscope.utils.constant import ModelFile
 from modelscope.utils.test_utils import test_level
 
 
+class PairedImageDataset(data.Dataset):
+
+    def __init__(self, root):
+        super(PairedImageDataset, self).__init__()
+        gt_dir = osp.join(root, 'gt')
+        lq_dir = osp.join(root, 'lq')
+        self.gt_filelist = os.listdir(gt_dir)
+        self.gt_filelist = sorted(self.gt_filelist, key=lambda x: int(x[:-4]))
+        self.gt_filelist = [osp.join(gt_dir, f) for f in self.gt_filelist]
+        self.lq_filelist = os.listdir(lq_dir)
+        self.lq_filelist = sorted(self.lq_filelist, key=lambda x: int(x[:-4]))
+        self.lq_filelist = [osp.join(lq_dir, f) for f in self.lq_filelist]
+
+    def _img_to_tensor(self, img):
+        return torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1).type(
+            torch.float32) / 255.
+
+    def __getitem__(self, index):
+        lq = cv2.imread(self.lq_filelist[index])
+        gt = cv2.imread(self.gt_filelist[index])
+        lq = cv2.resize(lq, (256, 256), interpolation=cv2.INTER_CUBIC)
+        gt = cv2.resize(gt, (256, 256), interpolation=cv2.INTER_CUBIC)
+        return \
+            {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
+
+    def __len__(self):
+        return len(self.gt_filelist)
+
+    def to_torch_dataset(self,
+                         columns: Union[str, List[str]] = None,
+                         preprocessors: Union[Callable, List[Callable]] = None,
+                         **format_kwargs):
+        return self
+
+
 class TestImageColorEnhanceTrainer(unittest.TestCase):
 
     def setUp(self):
@@ -27,47 +62,6 @@ class TestImageColorEnhanceTrainer(unittest.TestCase):
 
         self.model_id = 'damo/cv_csrnet_image-color-enhance-models'
 
-        class PairedImageDataset(data.Dataset):
-
-            def __init__(self, root):
-                super(PairedImageDataset, self).__init__()
-                gt_dir = osp.join(root, 'gt')
-                lq_dir = osp.join(root, 'lq')
-                self.gt_filelist = os.listdir(gt_dir)
-                self.gt_filelist = sorted(
-                    self.gt_filelist, key=lambda x: int(x[:-4]))
-                self.gt_filelist = [
-                    osp.join(gt_dir, f) for f in self.gt_filelist
-                ]
-                self.lq_filelist = os.listdir(lq_dir)
-                self.lq_filelist = sorted(
-                    self.lq_filelist, key=lambda x: int(x[:-4]))
-                self.lq_filelist = [
-                    osp.join(lq_dir, f) for f in self.lq_filelist
-                ]
-
-            def _img_to_tensor(self, img):
-                return torch.from_numpy(img[:, :, [2, 1, 0]]).permute(
-                    2, 0, 1).type(torch.float32) / 255.
-
-            def __getitem__(self, index):
-                lq = cv2.imread(self.lq_filelist[index])
-                gt = cv2.imread(self.gt_filelist[index])
-                lq = cv2.resize(lq, (256, 256), interpolation=cv2.INTER_CUBIC)
-                gt = cv2.resize(gt, (256, 256), interpolation=cv2.INTER_CUBIC)
-                return \
-                    {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
-
-            def __len__(self):
-                return len(self.gt_filelist)
-
-            def to_torch_dataset(self,
-                                 columns: Union[str, List[str]] = None,
-                                 preprocessors: Union[Callable,
-                                                      List[Callable]] = None,
-                                 **format_kwargs):
-                return self
-
         self.dataset = PairedImageDataset(
             './data/test/images/image_color_enhance/')
 
diff --git a/tests/trainers/test_image_portrait_enhancement_trainer.py b/tests/trainers/test_image_portrait_enhancement_trainer.py
index dc450ff0..049adf7e 100644
--- a/tests/trainers/test_image_portrait_enhancement_trainer.py
+++ b/tests/trainers/test_image_portrait_enhancement_trainer.py
@@ -19,6 +19,47 @@ from modelscope.utils.constant import ModelFile
 from modelscope.utils.test_utils import test_level
 
 
+class PairedImageDataset(data.Dataset):
+
+    def __init__(self, root, size=512):
+        super(PairedImageDataset, self).__init__()
+        self.size = size
+        gt_dir = osp.join(root, 'gt')
+        lq_dir = osp.join(root, 'lq')
+        self.gt_filelist = os.listdir(gt_dir)
+        self.gt_filelist = sorted(self.gt_filelist, key=lambda x: int(x[:-4]))
+        self.gt_filelist = [osp.join(gt_dir, f) for f in self.gt_filelist]
+        self.lq_filelist = os.listdir(lq_dir)
+        self.lq_filelist = sorted(self.lq_filelist, key=lambda x: int(x[:-4]))
+        self.lq_filelist = [osp.join(lq_dir, f) for f in self.lq_filelist]
+
+    def _img_to_tensor(self, img):
+        img = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1).type(
+            torch.float32) / 255.
+        return (img - 0.5) / 0.5
+
+    def __getitem__(self, index):
+        lq = cv2.imread(self.lq_filelist[index])
+        gt = cv2.imread(self.gt_filelist[index])
+        lq = cv2.resize(
+            lq, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
+        gt = cv2.resize(
+            gt, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
+
+        return \
+            {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
+
+    def __len__(self):
+        return len(self.gt_filelist)
+
+    def to_torch_dataset(self,
+                         columns: Union[str, List[str]] = None,
+                         preprocessors: Union[Callable, List[Callable]] = None,
+                         **format_kwargs):
+        # self.preprocessor = preprocessors
+        return self
+
+
 class TestImagePortraitEnhancementTrainer(unittest.TestCase):
 
     def setUp(self):
@@ -29,53 +70,6 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
 
         self.model_id = 'damo/cv_gpen_image-portrait-enhancement'
 
-        class PairedImageDataset(data.Dataset):
-
-            def __init__(self, root, size=512):
-                super(PairedImageDataset, self).__init__()
-                self.size = size
-                gt_dir = osp.join(root, 'gt')
-                lq_dir = osp.join(root, 'lq')
-                self.gt_filelist = os.listdir(gt_dir)
-                self.gt_filelist = sorted(
-                    self.gt_filelist, key=lambda x: int(x[:-4]))
-                self.gt_filelist = [
-                    osp.join(gt_dir, f) for f in self.gt_filelist
-                ]
-                self.lq_filelist = os.listdir(lq_dir)
-                self.lq_filelist = sorted(
-                    self.lq_filelist, key=lambda x: int(x[:-4]))
-                self.lq_filelist = [
-                    osp.join(lq_dir, f) for f in self.lq_filelist
-                ]
-
-            def _img_to_tensor(self, img):
-                img = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(
-                    2, 0, 1).type(torch.float32) / 255.
-                return (img - 0.5) / 0.5
-
-            def __getitem__(self, index):
-                lq = cv2.imread(self.lq_filelist[index])
-                gt = cv2.imread(self.gt_filelist[index])
-                lq = cv2.resize(
-                    lq, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-                gt = cv2.resize(
-                    gt, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-
-                return \
-                    {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
-
-            def __len__(self):
-                return len(self.gt_filelist)
-
-            def to_torch_dataset(self,
-                                 columns: Union[str, List[str]] = None,
-                                 preprocessors: Union[Callable,
-                                                      List[Callable]] = None,
-                                 **format_kwargs):
-                # self.preprocessor = preprocessors
-                return self
-
         self.dataset = PairedImageDataset(
             './data/test/images/face_enhancement/')
 
diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py
index be29844d..17fa97f9 100644
--- a/tests/trainers/test_trainer.py
+++ b/tests/trainers/test_trainer.py
@@ -16,6 +16,7 @@ from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
 from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
+from modelscope.trainers.base import DummyTrainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
 from modelscope.utils.test_utils import create_dummy_test_dataset, test_level
 
@@ -264,7 +265,7 @@ class TrainerTest(unittest.TestCase):
             {
                 LogKeys.MODE: ModeKeys.EVAL,
                 LogKeys.EPOCH: 1,
-                LogKeys.ITER: 20
+                LogKeys.ITER: 10
             }, json.loads(lines[2]))
         self.assertDictContainsSubset(
             {
@@ -284,7 +285,7 @@ class TrainerTest(unittest.TestCase):
             {
                 LogKeys.MODE: ModeKeys.EVAL,
                 LogKeys.EPOCH: 2,
-                LogKeys.ITER: 20
+                LogKeys.ITER: 10
             }, json.loads(lines[5]))
         self.assertDictContainsSubset(
             {
@@ -304,7 +305,7 @@ class TrainerTest(unittest.TestCase):
             {
                 LogKeys.MODE: ModeKeys.EVAL,
                 LogKeys.EPOCH: 3,
-                LogKeys.ITER: 20
+                LogKeys.ITER: 10
             }, json.loads(lines[8]))
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)

From 054151d92fbb9978f16a7a2b7b16fe7c5e7777a7 Mon Sep 17 00:00:00 2001
From: "xiangpeng.wxp" <xiangpeng.wxp@alibaba-inc.com>
Date: Tue, 30 Aug 2022 22:08:27 +0800
Subject: [PATCH 026/175] [to #42322933]nlp_translation_preprocess

* nlp translation preprocess branch
 * pull the latest master
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9920445
---
 .../pipelines/nlp/translation_pipeline.py     | 30 ++++++++++++++++++-
 .../nlp/csanmt_translation_trainer.py         |  6 ++--
 requirements/nlp.txt                          |  3 ++
 tests/pipelines/test_csanmt_translation.py    | 20 +++++++++----
 4 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py
index b9b74ce4..e4893577 100644
--- a/modelscope/pipelines/nlp/translation_pipeline.py
+++ b/modelscope/pipelines/nlp/translation_pipeline.py
@@ -1,8 +1,11 @@
 import os.path as osp
 from typing import Any, Dict
 
+import jieba
 import numpy as np
 import tensorflow as tf
+from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
+from subword_nmt import apply_bpe
 
 from modelscope.metainfo import Pipelines
 from modelscope.models.base import Model
@@ -59,6 +62,21 @@ class TranslationPipeline(Pipeline):
             dtype=tf.int64, shape=[None, None], name='input_wids')
         self.output = {}
 
+        # preprocess
+        self._src_lang = self.cfg['preprocessor']['src_lang']
+        self._tgt_lang = self.cfg['preprocessor']['tgt_lang']
+        self._src_bpe_path = osp.join(
+            model, self.cfg['preprocessor']['src_bpe']['file'])
+
+        if self._src_lang == 'zh':
+            self._tok = jieba
+        else:
+            self._punct_normalizer = MosesPunctNormalizer(lang=self._src_lang)
+            self._tok = MosesTokenizer(lang=self._src_lang)
+        self._detok = MosesDetokenizer(lang=self._tgt_lang)
+
+        self._bpe = apply_bpe.BPE(open(self._src_bpe_path))
+
         # model
         output = self.model(self.input_wids)
         self.output.update(output)
@@ -70,10 +88,19 @@ class TranslationPipeline(Pipeline):
             model_loader.restore(sess, model_path)
 
     def preprocess(self, input: str) -> Dict[str, Any]:
+        if self._src_lang == 'zh':
+            input_tok = self._tok.cut(input)
+            input_tok = ' '.join(list(input_tok))
+        else:
+            input = self._punct_normalizer.normalize(input)
+            input_tok = self._tok.tokenize(
+                input, return_str=True, aggressive_dash_splits=True)
+
+        input_bpe = self._bpe.process_line(input_tok)
         input_ids = np.array([[
             self._src_vocab[w]
             if w in self._src_vocab else self.cfg['model']['src_vocab_size']
-            for w in input.strip().split()
+            for w in input_bpe.strip().split()
         ]])
         result = {'input_ids': input_ids}
         return result
@@ -92,5 +119,6 @@ class TranslationPipeline(Pipeline):
             self._trg_rvocab[wid] if wid in self._trg_rvocab else '<unk>'
             for wid in wids
         ]).replace('@@ ', '').replace('@@', '')
+        translation_out = self._detok.detokenize(translation_out.split())
         result = {OutputKeys.TRANSLATION: translation_out}
         return result
diff --git a/modelscope/trainers/nlp/csanmt_translation_trainer.py b/modelscope/trainers/nlp/csanmt_translation_trainer.py
index 067c1d83..62ae91a8 100644
--- a/modelscope/trainers/nlp/csanmt_translation_trainer.py
+++ b/modelscope/trainers/nlp/csanmt_translation_trainer.py
@@ -241,8 +241,10 @@ def input_fn(src_file,
     trg_dataset = tf.data.TextLineDataset(trg_file)
     src_trg_dataset = tf.data.Dataset.zip((src_dataset, trg_dataset))
     src_trg_dataset = src_trg_dataset.map(
-        lambda src, trg:
-        (tf.string_split([src]).values, tf.string_split([trg]).values),
+        lambda src, trg: (tf.string_split([src]), tf.string_split([trg])),
+        num_parallel_calls=10).prefetch(1000000)
+    src_trg_dataset = src_trg_dataset.map(
+        lambda src, trg: (src.values, trg.values),
         num_parallel_calls=10).prefetch(1000000)
     src_trg_dataset = src_trg_dataset.map(
         lambda src, trg: (src_vocab.lookup(src), trg_vocab.lookup(trg)),
diff --git a/requirements/nlp.txt b/requirements/nlp.txt
index 6bd56aff..ada4fc50 100644
--- a/requirements/nlp.txt
+++ b/requirements/nlp.txt
@@ -1,11 +1,14 @@
 en_core_web_sm>=2.3.5
 fairseq>=0.10.2
+jieba>=0.42.1
 pai-easynlp
 # rough-score was just recently updated from 0.0.4 to 0.0.7
 # which introduced compatability issues that are being investigated
 rouge_score<=0.0.4
+sacremoses>=0.0.41
 seqeval
 spacy>=2.3.5
+subword_nmt>=0.3.8
 text2sql_lgesql
 tokenizers
 transformers>=4.12.0
diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py
index c852b1ff..bb6022ec 100644
--- a/tests/pipelines/test_csanmt_translation.py
+++ b/tests/pipelines/test_csanmt_translation.py
@@ -7,18 +7,26 @@ from modelscope.utils.test_utils import test_level
 
 
 class TranslationTest(unittest.TestCase):
-    model_id = 'damo/nlp_csanmt_translation_zh2en'
-    inputs = '声明 补充 说 ， 沃伦 的 同事 都 深感 震惊 ， 并且 希望 他 能够 投@@ 案@@ 自@@ 首 。'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_with_model_name(self):
-        pipeline_ins = pipeline(task=Tasks.translation, model=self.model_id)
-        print(pipeline_ins(input=self.inputs))
+    def test_run_with_model_name_for_zh2en(self):
+        model_id = 'damo/nlp_csanmt_translation_zh2en'
+        inputs = '声明补充说，沃伦的同事都深感震惊，并且希望他能够投案自首。'
+        pipeline_ins = pipeline(task=Tasks.translation, model=model_id)
+        print(pipeline_ins(input=inputs))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name_for_en2zh(self):
+        model_id = 'damo/nlp_csanmt_translation_en2zh'
+        inputs = 'Elon Musk, co-founder and chief executive officer of Tesla Motors.'
+        pipeline_ins = pipeline(task=Tasks.translation, model=model_id)
+        print(pipeline_ins(input=inputs))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
+        inputs = '声明补充说，沃伦的同事都深感震惊，并且希望他能够投案自首。'
         pipeline_ins = pipeline(task=Tasks.translation)
-        print(pipeline_ins(input=self.inputs))
+        print(pipeline_ins(input=inputs))
 
 
 if __name__ == '__main__':

From fbde374659b31466f48124c79cc26c852553ca9f Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Tue, 30 Aug 2022 23:17:07 +0800
Subject: [PATCH 027/175] [to #42322933] add regress tests

Add regression tests to some unit tests.
First, run a baseline test to create a pickle file containing the inputs and outputs of the modules; changes can then be observed between
the latest version and the baseline file.
Some baseline files are submitted in the data/test/regression folder.
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9814693
---
 .gitattributes                                |   2 +
 data/test/regression/fill_mask_bert_zh.bin    |   3 +
 data/test/regression/fill_mask_sbert_en.bin   |   3 +
 data/test/regression/fill_mask_sbert_zh.bin   |   3 +
 data/test/regression/fill_mask_veco_en.bin    |   3 +
 data/test/regression/fill_mask_veco_zh.bin    |   3 +
 data/test/regression/sbert_nli.bin            |   3 +
 data/test/regression/sbert_sen_sim.bin        |   3 +
 data/test/regression/sbert_ws_en.bin          |   3 +
 data/test/regression/sbert_ws_zh.bin          |   3 +
 data/test/regression/sbert_zero_shot.bin      |   3 +
 modelscope/utils/regress_test_utils.py        | 703 ++++++++++++++++++
 tests/pipelines/test_fill_mask.py             |  23 +-
 tests/pipelines/test_nli.py                   |   7 +-
 tests/pipelines/test_sentence_similarity.py   |   6 +-
 .../test_sentiment_classification.py          |   1 -
 tests/pipelines/test_word_segmentation.py     |  11 +-
 .../test_zero_shot_classification.py          |   9 +-
 tests/run.py                                  |   1 +
 19 files changed, 777 insertions(+), 16 deletions(-)
 create mode 100644 data/test/regression/fill_mask_bert_zh.bin
 create mode 100644 data/test/regression/fill_mask_sbert_en.bin
 create mode 100644 data/test/regression/fill_mask_sbert_zh.bin
 create mode 100644 data/test/regression/fill_mask_veco_en.bin
 create mode 100644 data/test/regression/fill_mask_veco_zh.bin
 create mode 100644 data/test/regression/sbert_nli.bin
 create mode 100644 data/test/regression/sbert_sen_sim.bin
 create mode 100644 data/test/regression/sbert_ws_en.bin
 create mode 100644 data/test/regression/sbert_ws_zh.bin
 create mode 100644 data/test/regression/sbert_zero_shot.bin
 create mode 100644 modelscope/utils/regress_test_utils.py

diff --git a/.gitattributes b/.gitattributes
index 60ff0dd2..1a3015ec 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -4,4 +4,6 @@
 *.wav filter=lfs diff=lfs merge=lfs -text
 *.JPEG filter=lfs diff=lfs merge=lfs -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
 *.avi filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
diff --git a/data/test/regression/fill_mask_bert_zh.bin b/data/test/regression/fill_mask_bert_zh.bin
new file mode 100644
index 00000000..17c28b81
--- /dev/null
+++ b/data/test/regression/fill_mask_bert_zh.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:541183383bb06aa3ca2c44a68cd51c1be5e3e984a1dee2c58092b9552660f3ce
+size 61883
diff --git a/data/test/regression/fill_mask_sbert_en.bin b/data/test/regression/fill_mask_sbert_en.bin
new file mode 100644
index 00000000..09aaf300
--- /dev/null
+++ b/data/test/regression/fill_mask_sbert_en.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f0afcd9d2aa5ac9569114203bd9db4f1a520c903a88fd4854370cdde0e7eab7
+size 119940
diff --git a/data/test/regression/fill_mask_sbert_zh.bin b/data/test/regression/fill_mask_sbert_zh.bin
new file mode 100644
index 00000000..812f7ba2
--- /dev/null
+++ b/data/test/regression/fill_mask_sbert_zh.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280
+size 119940
diff --git a/data/test/regression/fill_mask_veco_en.bin b/data/test/regression/fill_mask_veco_en.bin
new file mode 100644
index 00000000..be3fddc8
--- /dev/null
+++ b/data/test/regression/fill_mask_veco_en.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705
+size 119619
diff --git a/data/test/regression/fill_mask_veco_zh.bin b/data/test/regression/fill_mask_veco_zh.bin
new file mode 100644
index 00000000..c0d27e20
--- /dev/null
+++ b/data/test/regression/fill_mask_veco_zh.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a
+size 119619
diff --git a/data/test/regression/sbert_nli.bin b/data/test/regression/sbert_nli.bin
new file mode 100644
index 00000000..a5f680bb
--- /dev/null
+++ b/data/test/regression/sbert_nli.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44e3925c15d86d8596baeb6bd1d153d86f57b7489798b2cf988a1248e110fd62
+size 62231
diff --git a/data/test/regression/sbert_sen_sim.bin b/data/test/regression/sbert_sen_sim.bin
new file mode 100644
index 00000000..a59cbe0b
--- /dev/null
+++ b/data/test/regression/sbert_sen_sim.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ff17a0272752de4c88d4254b2e881f97f8ef022f03609d03ee1de0ae964368a
+size 62235
diff --git a/data/test/regression/sbert_ws_en.bin b/data/test/regression/sbert_ws_en.bin
new file mode 100644
index 00000000..4eb562d6
--- /dev/null
+++ b/data/test/regression/sbert_ws_en.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572
+size 60801
diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin
new file mode 100644
index 00000000..555f640d
--- /dev/null
+++ b/data/test/regression/sbert_ws_zh.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c
+size 60801
diff --git a/data/test/regression/sbert_zero_shot.bin b/data/test/regression/sbert_zero_shot.bin
new file mode 100644
index 00000000..23d40946
--- /dev/null
+++ b/data/test/regression/sbert_zero_shot.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85
+size 61589
diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py
new file mode 100644
index 00000000..ca50d579
--- /dev/null
+++ b/modelscope/utils/regress_test_utils.py
@@ -0,0 +1,703 @@
+import contextlib
+import hashlib
+import os
+import pickle
+import random
+import shutil
+import tempfile
+from collections.abc import Mapping
+from pathlib import Path
+from types import FunctionType
+from typing import Any, Dict, Union
+
+import json
+import numpy as np
+import torch.optim
+from torch import nn
+
+
+class RegressTool:
+    """Guard inference/training results against unintended changes, for use in unit tests.
+
+    First, run a baseline test to create a result file; changes can then be observed between
+    the latest version and the baseline file.
+    """
+
+    def __init__(self,
+                 baseline: bool = None,
+                 store_func: FunctionType = None,
+                 load_func: FunctionType = None):
+        """baseline: True writes the baseline file, False compares against it, None disables monitoring.
+        store_func/load_func: optional replacements for the default store/load of the baseline file."""
+        self.baseline = baseline
+        self.store_func = store_func
+        self.load_func = load_func
+        print(f'Current working dir is: {Path.cwd()}')
+
+    def store(self, local, remote):
+        if self.store_func is not None:
+            self.store_func(local, remote)
+        else:  # default: copy `local` into <cwd>/data/test/regression/<remote>
+            path = os.path.abspath(
+                os.path.join(Path.cwd(), 'data', 'test', 'regression'))
+            os.makedirs(path, exist_ok=True)
+            shutil.copy(local, os.path.join(path, remote))
+
+    def load(self, local, remote):
+        if self.load_func is not None:
+            self.load_func(local, remote)
+        else:  # default: symlink `local` to <cwd>/data/test/regression/<remote>
+            path = os.path.abspath(
+                os.path.join(Path.cwd(), 'data', 'test', 'regression'))
+            baseline = os.path.join(path, remote)
+            if not os.path.exists(baseline):
+                raise ValueError(f'base line file {baseline} not exist')
+            with open(baseline, 'rb') as f:
+                md5 = hashlib.md5(f.read()).hexdigest()
+            print(f'local file found:{baseline}, md5:{md5}')
+            if os.path.lexists(local):  # lexists also catches a dangling symlink
+                os.remove(local)
+            os.symlink(baseline, local, target_is_directory=False)
+
+    @contextlib.contextmanager
+    def monitor_module_single_forward(self,
+                                      module: nn.Module,
+                                      file_name: str,
+                                      compare_fn=None):
+        """Monitor a pytorch module in a single forward.
+
+        @param module: A torch module
+        @param file_name: The file_name to store or load file
+        @param compare_fn: A custom fn used to compare the results manually.
+
+        >>> def compare_fn(v1, v2, key, type):
+        >>>     return None
+
+        v1 is the baseline value, v2 is the value of the current version,
+        key is the key of submodules, type is one of 'input', 'output'.
+        """
+        baseline = os.getenv('REGRESSION_BASELINE')
+        if baseline is None or self.baseline is None:
+            yield
+            return
+
+        baseline = self.baseline
+        io_json = {}
+        absolute_path = f'./{file_name}.bin'
+        if not isinstance(module, nn.Module):
+            assert hasattr(module, 'model')
+            module = module.model
+
+        hack_forward(module, file_name, io_json)
+        intercept_module(module, io_json)
+        try:
+            yield
+        finally:  # un-patch even when the monitored code raised
+            hack_forward(module, None, None, restore=True)
+            intercept_module(module, None, restore=True)
+        if baseline:
+            with open(absolute_path, 'wb') as f:
+                pickle.dump(io_json, f)
+            self.store(absolute_path, f'{file_name}.bin')
+            os.remove(absolute_path)
+        else:
+            name = os.path.basename(absolute_path)
+            baseline = os.path.join(tempfile.gettempdir(), name)
+            self.load(baseline, name)
+            with open(baseline, 'rb') as f:
+                baseline_json = pickle.load(f)  # NOTE: only load trusted baseline files
+
+            class NumpyEncoder(json.JSONEncoder):
+                """Special json encoder for numpy types
+                """
+
+                def default(self, obj):
+                    if isinstance(obj, np.integer):
+                        return int(obj)
+                    elif isinstance(obj, np.floating):
+                        return float(obj)
+                    elif isinstance(obj, np.ndarray):
+                        return obj.tolist()
+                    return json.JSONEncoder.default(self, obj)
+
+            print(f'baseline: {json.dumps(baseline_json, cls=NumpyEncoder)}')
+            print(f'latest  : {json.dumps(io_json, cls=NumpyEncoder)}')
+            if not compare_io_and_print(baseline_json, io_json, compare_fn):
+                raise ValueError('Result not match!')
+
+    @contextlib.contextmanager
+    def monitor_module_train(self,
+                             trainer: Union[Dict, Any],
+                             file_name,
+                             level='config',
+                             compare_fn=None,
+                             ignore_keys=None,
+                             compare_random=True,
+                             lazy_stop_callback=None):
+        """Monitor a pytorch module's backward data and cfg data within a step of the optimizer.
+
+        This is usually useful when you try to change some dangerous code
+        which has the risk of affecting the training loop.
+
+        @param trainer: A dict or an object containing the model/optimizer/lr_scheduler
+        @param file_name: The file_name to store or load file
+        @param level: The regression level.
+            'strict' for matching every single tensor.
+                     Please make sure the parameters of head are fixed
+                     and the drop-out rate is zero.
+            'config' for matching the initial config, like cfg file, optimizer param_groups,
+                     lr_scheduler params and the random seed.
+            'metric' for comparing the best metrics in the evaluation loop.
+        @param compare_fn: A custom fn used to compare the results manually.
+        @param ignore_keys: The keys to ignore of the named_parameters.
+        @param compare_random: Whether to compare random settings, default True.
+        @param lazy_stop_callback: A callback passed in; it is called when the monitoring is over.
+
+        >>> def compare_fn(v1, v2, key, type):
+        >>>     return None
+
+        v1 is the baseline value, v2 is the value of the current version,
+        key is the key of modules/parameters,
+        type is one of 'input', 'output', 'backward', 'optimizer', 'lr_scheduler', 'cfg', 'state'
+        """
+        baseline = os.getenv('REGRESSION_BASELINE')
+        if baseline is None or self.baseline is None:
+            yield
+            return
+
+        baseline = self.baseline
+        io_json = {}
+        bw_json = {}
+        absolute_path = f'./{file_name}.bin'
+
+        if level == 'strict':
+            print(
+                "[Important] The level of regression is 'strict', please make sure your model's parameters are "
+                'fixed and all drop-out rates have been set to zero.')
+
+        assert hasattr(
+            trainer, 'model') or 'model' in trainer, 'model must be in trainer'
+        module = trainer['model'] if isinstance(trainer,
+                                                dict) else trainer.model
+        if not isinstance(module, nn.Module):
+            assert hasattr(module, 'model')
+            module = module.model
+
+        assert hasattr(
+            trainer, 'optimizer'
+        ) or 'optimizer' in trainer, 'optimizer must be in trainer'
+        assert hasattr(
+            trainer, 'lr_scheduler'
+        ) or 'lr_scheduler' in trainer, 'lr_scheduler must be in trainer'
+        optimizer: torch.optim.Optimizer = trainer['optimizer'] if isinstance(
+            trainer, dict) else trainer.optimizer
+        lr_scheduler: torch.optim.lr_scheduler._LRScheduler = trainer['lr_scheduler'] if isinstance(trainer, dict) \
+            else trainer.lr_scheduler
+        torch_state = numpify_tensor_nested(torch.get_rng_state())
+        np_state = np.random.get_state()
+        random_seed = random.getstate()
+        seed = trainer._seed if hasattr(
+            trainer,
+            '_seed') else trainer.seed if hasattr(trainer, 'seed') else None
+
+        if level == 'strict':
+            hack_forward(module, file_name, io_json)
+            intercept_module(module, io_json)
+        hack_backward(
+            module, optimizer, bw_json, lazy_stop_callback=lazy_stop_callback)
+        try:
+            yield
+        finally:  # un-patch even when the monitored code raised
+            hack_backward(module, optimizer, None, restore=True)
+            if level == 'strict':
+                hack_forward(module, None, None, restore=True)
+                intercept_module(module, None, restore=True)
+
+        optimizer_dict = optimizer.state_dict()
+        optimizer_dict.pop('state', None)
+        summary = {
+            'forward': io_json,
+            'backward': bw_json,
+            'optimizer': {
+                'type': optimizer.__class__.__name__,
+                'defaults': optimizer.defaults,
+                'state_dict': optimizer_dict
+            },
+            'lr_scheduler': {
+                'type': lr_scheduler.__class__.__name__,
+                'state_dict': lr_scheduler.state_dict()
+            },
+            'cfg': trainer.cfg.to_dict() if hasattr(trainer, 'cfg') else None,
+            'state': {
+                'torch_state': torch_state,
+                'np_state': np_state,
+                'random_seed': random_seed,
+                'seed': seed,
+            }
+        }
+
+        if baseline:
+            with open(absolute_path, 'wb') as f:
+                pickle.dump(summary, f)
+            self.store(absolute_path, f'{file_name}.bin')
+            os.remove(absolute_path)
+        else:
+            name = os.path.basename(absolute_path)
+            baseline = os.path.join(tempfile.gettempdir(), name)
+            self.load(baseline, name)
+            with open(baseline, 'rb') as f:
+                baseline_json = pickle.load(f)
+
+            if level == 'strict' and not compare_io_and_print(
+                    baseline_json['forward'], io_json, compare_fn):
+                raise RuntimeError('Forward not match!')
+            if not compare_backward_and_print(
+                    baseline_json['backward'],
+                    bw_json,
+                    compare_fn=compare_fn,
+                    ignore_keys=ignore_keys,
+                    level=level):
+                raise RuntimeError('Backward not match!')
+            cfg_opt1 = {
+                'optimizer': baseline_json['optimizer'],
+                'lr_scheduler': baseline_json['lr_scheduler'],
+                'cfg': baseline_json['cfg'],
+                'state': None if not compare_random else baseline_json['state']
+            }
+            cfg_opt2 = {
+                'optimizer': summary['optimizer'],
+                'lr_scheduler': summary['lr_scheduler'],
+                'cfg': summary['cfg'],
+                'state': None if not compare_random else summary['state']
+            }
+            if not compare_cfg_and_optimizers(cfg_opt1, cfg_opt2, compare_fn):
+                raise RuntimeError('Cfg or optimizers not match!')
+
+
+class MsRegressTool(RegressTool):
+    """RegressTool that monitors a trainer by wrapping its `train_loop`."""
+
+    class EarlyStopError(Exception):
+        """Raised by the injected hook to leave the training loop."""
+
+    @contextlib.contextmanager
+    def monitor_ms_train(self, trainer, file_name, level='config',
+                         compare_fn=None, ignore_keys=None):
+        """Replace `trainer.train_loop` with a monitored version.
+
+        The wrapper runs the original loop (kept as `train_loop_origin`) inside
+        `monitor_module_train` and swallows `EarlyStopError`. The replacement
+        is not undone when the context exits.
+        """
+
+        def _register_early_stop():
+            from modelscope.trainers.hooks.hook import Hook, Priority
+
+            class EarlyStopHook(Hook):
+                PRIORITY = Priority.VERY_LOW
+
+                def after_iter(self, trainer):
+                    raise MsRegressTool.EarlyStopError('Test finished.')
+
+            trainer.register_hook(EarlyStopHook())
+
+        def _train_loop(trainer, *args, **kwargs):
+            with self.monitor_module_train(
+                    trainer, file_name, level, compare_fn=compare_fn,
+                    ignore_keys=ignore_keys,
+                    lazy_stop_callback=_register_early_stop):
+                try:
+                    return trainer.train_loop_origin(*args, **kwargs)
+                except MsRegressTool.EarlyStopError:
+                    return None
+
+        original_loop = trainer.train_loop
+        trainer.train_loop_origin = original_loop
+        trainer.train_loop = type(original_loop)(_train_loop, trainer)
+        yield
+
+
+def compare_module(module1: nn.Module, module2: nn.Module):
+    """Return True only if both modules have the same number of parameters and every pair is equal."""
+    ps1, ps2 = list(module1.parameters()), list(module2.parameters())
+    return len(ps1) == len(ps2) and all(  # zip alone would ignore surplus parameters
+        a.shape == b.shape and not a.data.ne(b.data).any() for a, b in zip(ps1, ps2))
+
+
+def numpify_tensor_nested(tensors, reduction=None, clip_value=10000):
+    """Numpify `tensors`, recursing into lists/tuples/mappings; clip, then optionally sum/mean."""
+    import torch
+    if isinstance(tensors, (list, tuple)):
+        return type(tensors)(
+            numpify_tensor_nested(t, reduction, clip_value) for t in tensors)
+    if isinstance(tensors, Mapping):
+        return type(tensors)({
+            k: numpify_tensor_nested(t, reduction, clip_value)
+            for k, t in tensors.items()
+        })
+    if isinstance(tensors, torch.Tensor):
+        t: np.ndarray = tensors.cpu().numpy()
+        if clip_value is not None:
+            t = np.where(t > clip_value, clip_value, t)
+            t = np.where(t < -clip_value, -clip_value, t)
+        if reduction == 'sum':
+            return t.sum(dtype=np.float64)  # the np.float alias was removed in NumPy 1.24
+        elif reduction == 'mean':
+            return t.mean(dtype=np.float64)
+        return t
+    return tensors
+
+
+def detach_tensor_nested(tensors):
+    """Detach `tensors` (even if it's a nested list/tuple/mapping of tensors)."""
+    import torch
+    if isinstance(tensors, (list, tuple)):
+        return type(tensors)(detach_tensor_nested(t) for t in tensors)
+    if isinstance(tensors, Mapping):
+        return type(tensors)(
+            {k: detach_tensor_nested(t)
+             for k, t in tensors.items()})
+    if isinstance(tensors, torch.Tensor):
+        return tensors.detach()
+    return tensors
+
+
+def hack_forward(module: nn.Module,
+                 name,
+                 io_json,
+                 restore=False,
+                 keep_tensors=False):
+    """Patch or un-patch `module.forward` so that calls are recorded in `io_json`.
+
+    With `restore=False` and no `forward_origin` attribute on the module yet, the
+    current `forward` is saved as `forward_origin` and replaced by a recording
+    wrapper. With `restore=True` and a saved `forward_origin`, the saved method is
+    put back and the `forward_origin` attribute is deleted. Otherwise nothing
+    happens.
+
+    @param module: The module whose `forward` is patched.
+    @param name: The key under which the wrapper writes into `io_json`.
+    @param io_json: The dict the wrapper writes into.
+    @param restore: Un-patch instead of patch.
+    @param keep_tensors: Record the numpified values themselves instead of
+        their 'sum' and 'mean' reductions.
+    """
+
+    def _summarize(value):
+        # One detached copy serves both reductions.
+        detached = detach_tensor_nested(value)
+        if keep_tensors:
+            return numpify_tensor_nested(detached)
+        return {
+            kind: numpify_tensor_nested(detached, reduction=kind)
+            for kind in ('sum', 'mean')
+        }
+
+    def _forward(self, *args, **kwargs):
+        ret = self.forward_origin(*args, **kwargs)
+        # A later call with the same `name` overwrites the earlier record.
+        io_json[name] = {
+            'input': {
+                'args': _summarize(args),
+                'kwargs': _summarize(kwargs),
+            },
+            'output': _summarize(ret),
+        }
+        return ret
+
+    patched = hasattr(module, 'forward_origin')
+    if restore:
+        if patched:
+            module.forward = module.forward_origin
+            del module.forward_origin
+    elif not patched:
+        # Build the wrapper with the same type as the current `forward`.
+        method_type = type(module.forward)
+        module.forward_origin = module.forward
+        module.forward = method_type(_forward, module)
+
+
+def hack_backward(module: nn.Module,
+                  optimizer,
+                  io_json,
+                  restore=False,
+                  lazy_stop_callback=None):
+    """Patch or un-patch `optimizer.step` to record parameter statistics.
+
+    The patched `step` stores, for every named parameter of `module`, the 'sum'
+    and 'mean' of its data and grad before the real step and of its data after
+    it (key 'data_after'), then calls `lazy_stop_callback` if one was given.
+    With `restore=True` a previously saved `step_origin` is put back.
+
+    Nothing happens when the optimizer is already in the requested state.
+    """
+
+    def _stats(tensor):
+        # Reduce one tensor (or None) to its 'sum' and 'mean'.
+        detached = detach_tensor_nested(tensor)
+        return {
+            kind: numpify_tensor_nested(detached, reduction=kind)
+            for kind in ('sum', 'mean')
+        }
+
+    def _step(self, *args, **kwargs):
+        for name, param in module.named_parameters():
+            io_json[name] = {
+                'data': _stats(param.data),
+                'grad': _stats(param.grad),
+            }
+        ret = self.step_origin(*args, **kwargs)
+        for name, param in module.named_parameters():
+            io_json[name]['data_after'] = _stats(param.data)
+        if lazy_stop_callback is not None:
+            lazy_stop_callback()
+        return ret
+
+    patched = hasattr(optimizer, 'step_origin')
+    if restore:
+        if patched:
+            optimizer.step = optimizer.step_origin
+            del optimizer.step_origin
+    elif not patched:
+        # `type(optimizer.state_dict)` supplies the callable type that binds `_step`.
+        method_type = type(optimizer.state_dict)
+        optimizer.step_origin = optimizer.step
+        optimizer.step = method_type(_step, optimizer)
+
+
+def intercept_module(module: nn.Module, io_json, parent_name=None,
+                     restore=False):
+    """Apply `hack_forward` to every descendant of `module`, keyed by dotted path."""
+    for child_name, child in module.named_children():
+        # Top-level children use their bare name; deeper ones get a dotted path.
+        dotted = child_name if parent_name is None else parent_name + '.' + child_name
+        hack_forward(child, dotted, io_json, restore)
+        intercept_module(child, io_json, dotted, restore)
+
+
+def compare_arguments_nested(print_content, arg1, arg2):
+    """Recursively compare two nested values; on mismatch print `print_content` (if not None), return False."""
+    type1, type2 = type(arg1), type(arg2)
+    if type1.__name__ != type2.__name__:
+        if print_content is not None:
+            print(
+                f'{print_content}, type not equal:{type1.__name__} and {type2.__name__}'
+            )
+        return False
+
+    if arg1 is None:
+        return True
+    elif isinstance(arg1, (int, str, bool, np.bool_, np.integer, np.str_)):
+        if arg1 != arg2:
+            if print_content is not None:
+                print(f'{print_content}, arg1:{arg1}, arg2:{arg2}')
+            return False
+        return True
+    elif isinstance(arg1, (float, np.floating)):
+        if not np.isclose(arg1, arg2, rtol=1.e-3, atol=1.e-8, equal_nan=True):
+            if print_content is not None:
+                print(f'{print_content}, arg1:{arg1}, arg2:{arg2}')
+            return False
+        return True
+    elif isinstance(arg1, (tuple, list)):
+        if len(arg1) != len(arg2):
+            if print_content is not None:
+                print(
+                    f'{print_content}, length is not equal:{len(arg1)}, {len(arg2)}'
+                )
+            return False
+        if not all([
+                compare_arguments_nested(None, sub_arg1, sub_arg2)
+                for sub_arg1, sub_arg2 in zip(arg1, arg2)
+        ]):
+            if print_content is not None:
+                print(f'{print_content}')
+            return False
+        return True
+    elif isinstance(arg1, Mapping):
+        keys1 = arg1.keys()
+        keys2 = arg2.keys()
+        if len(keys1) != len(keys2):
+            if print_content is not None:
+                print(
+                    f'{print_content}, key length is not equal:{len(keys1)}, {len(keys2)}'
+                )
+            return False
+        if len(set(keys1) - set(keys2)) > 0:
+            if print_content is not None:
+                print(f'{print_content}, key diff:{set(keys1) - set(keys2)}')
+            return False
+        if not all([
+                compare_arguments_nested(None, arg1[key], arg2[key])
+                for key in keys1
+        ]):
+            if print_content is not None:
+                print(f'{print_content}')
+            return False
+        return True
+    elif isinstance(arg1, np.ndarray):
+        arg1 = np.where(np.equal(arg1, None), np.nan,
+                        arg1).astype(dtype=np.float64)
+        arg2 = np.where(np.equal(arg2, None), np.nan,
+                        arg2).astype(dtype=np.float64)
+        if arg1.shape != arg2.shape or not np.isclose(  # no silent broadcasting
+                arg1, arg2, rtol=1.e-3, atol=1.e-8,
+                equal_nan=True).all():
+            if print_content is not None:
+                print(f'{print_content}')
+            return False
+        return True
+    else:
+        raise ValueError(f'type not supported: {type1}')
+
+
+def compare_io_and_print(baseline_json, io_json, compare_fn=None):
+    """Compare the recorded inputs/outputs of modules found in both dumps.
+
+    A non-None `compare_fn(v1, v2, key, tag)` result (tag is 'input' or
+    'output') replaces the built-in comparison. Returns the overall verdict.
+    """
+    if compare_fn is None:
+
+        def compare_fn(*args, **kwargs):
+            return None
+
+    baseline_keys = set(baseline_json.keys())
+    io_keys = set(io_json.keys())
+    print(f'unmatched keys: {baseline_keys - io_keys}, '
+          f'{io_keys - baseline_keys}')
+    match = True
+    for key in baseline_keys.intersection(io_keys):
+        baseline_item, io_item = baseline_json[key], io_json[key]
+        v1input = numpify_tensor_nested(baseline_item['input'])
+        v2input = numpify_tensor_nested(io_item['input'])
+        res = compare_fn(v1input, v2input, key, 'input')
+        if res is None:
+            # Compare first so that mismatch details are always printed.
+            for part in ('args', 'kwargs'):
+                match = compare_arguments_nested(
+                    f'unmatched module {key} input {part}', v1input[part],
+                    v2input[part]) and match
+        else:
+            print(f'input of {key} compared with user compare_fn '
+                  f'with result:{res}\n')
+            match = match and res
+
+        v1output = numpify_tensor_nested(baseline_item['output'])
+        v2output = numpify_tensor_nested(io_item['output'])
+        res = compare_fn(v1output, v2output, key, 'output')
+        if res is None:
+            match = compare_arguments_nested(
+                f'unmatched module {key} outputs', v1output,
+                v2output) and match
+        else:
+            print(f'output of {key} compared with user compare_fn '
+                  f'with result:{res}\n')
+            match = match and res
+    return match
+
+
+def compare_backward_and_print(baseline_json,
+                               bw_json,
+                               level,
+                               ignore_keys=None,
+                               compare_fn=None):
+    """Compare per-key backward records of two dumps and print mismatches.
+
+    Keys in `ignore_keys` are skipped and a non-None `compare_fn` result
+    replaces the built-in check. Grad/post-step data need level 'strict'.
+    """
+    if compare_fn is None:
+
+        def compare_fn(*args, **kwargs):
+            return None
+
+    baseline_keys = set(baseline_json.keys())
+    bw_keys = set(bw_json.keys())
+    print(f'unmatched backward keys: {baseline_keys - bw_keys}, '
+          f'{bw_keys - baseline_keys}')
+    match = True
+    fields = ('data', 'grad', 'data_after')
+    for key in baseline_keys.intersection(bw_keys):
+        if ignore_keys is not None and key in ignore_keys:
+            continue
+        baseline_item, bw_item = baseline_json[key], bw_json[key]
+        res = compare_fn(baseline_item, bw_item, key, 'backward')
+        if res is not None:
+            print(f'backward data of {key} compared with '
+                  f'user compare_fn with result:{res}\n')
+            match = match and res
+            continue
+        data1, grad1, post1 = [baseline_item[name] for name in fields]
+        data2, grad2, post2 = [bw_item[name] for name in fields]
+        match = compare_arguments_nested(
+            f'unmatched module {key} tensor data', data1, data2) and match
+        if level != 'strict':
+            continue
+        match = compare_arguments_nested(
+            f'unmatched module {key} grad data', grad1, grad2) and match
+        match = compare_arguments_nested(
+            f'unmatched module {key} data after step', post1, post2) and match
+    return match
+
+
+def compare_cfg_and_optimizers(baseline_json, cfg_json, compare_fn=None):
+    """Compare optimizer, lr_scheduler, cfg and random state of two dumps.
+
+    A non-None `compare_fn(v1, v2, None, tag)` result overrides a check.
+    """
+    if compare_fn is None:
+        def compare_fn(*args, **kwargs):
+            return None
+
+    keys = ('optimizer', 'lr_scheduler', 'cfg', 'state')
+    optimizer1, lr_scheduler1, cfg1, state1 = (baseline_json[k] for k in keys)
+    optimizer2, lr_scheduler2, cfg2, state2 = (cfg_json[k] for k in keys)
+    match = True
+    res = compare_fn(optimizer1, optimizer2, None, 'optimizer')
+    if res is not None:
+        print(f'optimizer compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        # A differing type is only reported; it does not change the verdict.
+        if optimizer1['type'] != optimizer2['type']:
+            print('Optimizer type not equal:'
+                  f"{optimizer1['type']} and {optimizer2['type']}")
+        match = compare_arguments_nested('unmatched optimizer defaults',
+                                         optimizer1['defaults'],
+                                         optimizer2['defaults']) and match
+        match = compare_arguments_nested('unmatched optimizer state_dict',
+                                         optimizer1['state_dict'],
+                                         optimizer2['state_dict']) and match
+
+    res = compare_fn(lr_scheduler1, lr_scheduler2, None, 'lr_scheduler')
+    if res is not None:
+        print(
+            f'lr_scheduler compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        # A differing type is only reported; it does not change the verdict.
+        if lr_scheduler1['type'] != lr_scheduler2['type']:
+            print('lr_scheduler type not equal:'
+                  f"{lr_scheduler1['type']} and {lr_scheduler2['type']}")
+        match = compare_arguments_nested('unmatched lr_scheduler state_dict',
+                                         lr_scheduler1['state_dict'],
+                                         lr_scheduler2['state_dict']) and match
+
+    res = compare_fn(cfg1, cfg2, None, 'cfg')
+    if res is not None:
+        print(f'cfg compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        match = compare_arguments_nested('unmatched cfg', cfg1, cfg2) and match
+
+    res = compare_fn(state1, state2, None, 'state')
+    if res is not None:
+        print(
+            f'random state compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        match = compare_arguments_nested('unmatched random state', state1,
+                                         state2) and match
+
+    return match
diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py
index 2f57b2d8..1b709e27 100644
--- a/tests/pipelines/test_fill_mask.py
+++ b/tests/pipelines/test_fill_mask.py
@@ -9,6 +9,7 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import FillMaskPipeline
 from modelscope.preprocessors import FillMaskPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
@@ -37,6 +38,7 @@ class FillMaskTest(unittest.TestCase):
         'Everything in [MASK] you call reality is really [MASK] a reflection of your '
         '[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.'
     }
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
@@ -98,9 +100,11 @@ class FillMaskTest(unittest.TestCase):
                 second_sequence=None)
             pipeline_ins = pipeline(
                 task=Tasks.fill_mask, model=model, preprocessor=preprocessor)
-            print(
-                f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
-                f'{pipeline_ins(self.test_inputs[language])}\n')
+            with self.regress_tool.monitor_module_single_forward(
+                    pipeline_ins.model, f'fill_mask_sbert_{language}'):
+                print(
+                    f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
+                    f'{pipeline_ins(self.test_inputs[language])}\n')
 
         # veco
         model = Model.from_pretrained(self.model_id_veco)
@@ -111,8 +115,11 @@ class FillMaskTest(unittest.TestCase):
         for language in ['zh', 'en']:
             ori_text = self.ori_texts[language]
             test_input = self.test_inputs[language].replace('[MASK]', '<mask>')
-            print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
-                  f'{pipeline_ins(test_input)}\n')
+            with self.regress_tool.monitor_module_single_forward(
+                    pipeline_ins.model, f'fill_mask_veco_{language}'):
+                print(
+                    f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+                    f'{pipeline_ins(test_input)}\n')
 
         # zh bert
         model = Model.from_pretrained(self.model_id_bert)
@@ -123,8 +130,10 @@ class FillMaskTest(unittest.TestCase):
         language = 'zh'
         ori_text = self.ori_texts[language]
         test_input = self.test_inputs[language]
-        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
-              f'{pipeline_ins(test_input)}\n')
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'fill_mask_bert_zh'):
+            print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+                  f'{pipeline_ins(test_input)}\n')
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py
index 1e259a2e..1d3fba12 100644
--- a/tests/pipelines/test_nli.py
+++ b/tests/pipelines/test_nli.py
@@ -8,6 +8,7 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import PairSentenceClassificationPipeline
 from modelscope.preprocessors import PairSentenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
@@ -15,6 +16,7 @@ class NLITest(unittest.TestCase):
     model_id = 'damo/nlp_structbert_nli_chinese-base'
     sentence1 = '四川商务职业学院和四川财经职业学院哪个好？'
     sentence2 = '四川商务职业学院商务管理在哪个校区？'
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
@@ -26,7 +28,6 @@ class NLITest(unittest.TestCase):
         pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer)
         print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
               f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}')
-        print()
         print(
             f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
             f'pipeline1: {pipeline2(input=(self.sentence1, self.sentence2))}')
@@ -42,7 +43,9 @@ class NLITest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(task=Tasks.nli, model=self.model_id)
-        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_nli'):
+            print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py
index d39f6783..6990bf75 100644
--- a/tests/pipelines/test_sentence_similarity.py
+++ b/tests/pipelines/test_sentence_similarity.py
@@ -8,6 +8,7 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import PairSentenceClassificationPipeline
 from modelscope.preprocessors import PairSentenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
@@ -15,6 +16,7 @@ class SentenceSimilarityTest(unittest.TestCase):
     model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
     sentence1 = '今天气温比昨天高么？'
     sentence2 = '今天湿度比昨天高么？'
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
@@ -47,7 +49,9 @@ class SentenceSimilarityTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.sentence_similarity, model=self.model_id)
-        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_sen_sim'):
+            print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py
index f3bc6981..35c96282 100644
--- a/tests/pipelines/test_sentiment_classification.py
+++ b/tests/pipelines/test_sentiment_classification.py
@@ -30,7 +30,6 @@ class SentimentClassificationTaskModelTest(unittest.TestCase):
             preprocessor=tokenizer)
         print(f'sentence1: {self.sentence1}\n'
               f'pipeline1:{pipeline1(input=self.sentence1)}')
-        print()
         print(f'sentence1: {self.sentence1}\n'
               f'pipeline1: {pipeline2(input=self.sentence1)}')
 
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
index c332d987..87006f96 100644
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -9,6 +9,7 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import WordSegmentationPipeline
 from modelscope.preprocessors import TokenClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
@@ -16,6 +17,7 @@ class WordSegmentationTest(unittest.TestCase):
     model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
     sentence = '今天天气不错，适合出去游玩'
     sentence_eng = 'I am a program.'
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
@@ -27,7 +29,6 @@ class WordSegmentationTest(unittest.TestCase):
             Tasks.word_segmentation, model=model, preprocessor=tokenizer)
         print(f'sentence: {self.sentence}\n'
               f'pipeline1:{pipeline1(input=self.sentence)}')
-        print()
         print(f'pipeline2: {pipeline2(input=self.sentence)}')
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -42,8 +43,12 @@ class WordSegmentationTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.word_segmentation, model=self.model_id)
-        print(pipeline_ins(input=self.sentence))
-        print(pipeline_ins(input=self.sentence_eng))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_ws_zh'):
+            print(pipeline_ins(input=self.sentence))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_ws_en'):
+            print(pipeline_ins(input=self.sentence_eng))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
index 7620a0ed..f0f2a481 100644
--- a/tests/pipelines/test_zero_shot_classification.py
+++ b/tests/pipelines/test_zero_shot_classification.py
@@ -8,6 +8,7 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import ZeroShotClassificationPipeline
 from modelscope.preprocessors import ZeroShotClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
@@ -16,6 +17,7 @@ class ZeroShotClassificationTest(unittest.TestCase):
     sentence = '全新突破 解放军运20版空中加油机曝光'
     labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事']
     template = '这篇文章的标题是{}'
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
@@ -33,7 +35,6 @@ class ZeroShotClassificationTest(unittest.TestCase):
             f'sentence: {self.sentence}\n'
             f'pipeline1:{pipeline1(input=self.sentence,candidate_labels=self.labels)}'
         )
-        print()
         print(
             f'sentence: {self.sentence}\n'
             f'pipeline2: {pipeline2(self.sentence,candidate_labels=self.labels,hypothesis_template=self.template)}'
@@ -53,7 +54,11 @@ class ZeroShotClassificationTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.zero_shot_classification, model=self.model_id)
-        print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_zero_shot'):
+            print(
+                pipeline_ins(
+                    input=self.sentence, candidate_labels=self.labels))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
diff --git a/tests/run.py b/tests/run.py
index 1a601eda..79509745 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -334,6 +334,7 @@ if __name__ == '__main__':
         help='Save result to directory, internal use only')
     args = parser.parse_args()
     set_test_level(args.level)
+    os.environ['REGRESSION_BASELINE'] = '1'
     logger.info(f'TEST LEVEL: {test_level()}')
     if not args.disable_profile:
         from utils import profiler

From 681ea8cd17bdc47fbce0235afa38afd3f8c3ded7 Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingda.chen@alibaba-inc.com>
Date: Wed, 31 Aug 2022 12:57:14 +0800
Subject: [PATCH 028/175] [to #42322933] disable image diffusion tests

---
 tests/pipelines/test_image2image_generation.py  | 2 +-
 tests/pipelines/test_image2image_translation.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pipelines/test_image2image_generation.py b/tests/pipelines/test_image2image_generation.py
index 487fe4d0..116cef76 100644
--- a/tests/pipelines/test_image2image_generation.py
+++ b/tests/pipelines/test_image2image_generation.py
@@ -11,7 +11,7 @@ from modelscope.utils.test_utils import test_level
 
 class Image2ImageGenerationTest(unittest.TestCase):
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub(self):
         r"""We provide two generation modes, i.e., Similar Image Generation and Interpolation.
             You can pass the following parameters for different mode.
diff --git a/tests/pipelines/test_image2image_translation.py b/tests/pipelines/test_image2image_translation.py
index fd2f8063..a1cdb957 100644
--- a/tests/pipelines/test_image2image_translation.py
+++ b/tests/pipelines/test_image2image_translation.py
@@ -8,7 +8,7 @@ from modelscope.utils.test_utils import test_level
 
 class Image2ImageTranslationTest(unittest.TestCase):
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub(self):
         r"""We provide three translation modes, i.e., uncropping, colorization and combination.
             You can pass the following parameters for different mode.

From 39730f40fee921e5cda1859767319d78df286cfe Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingda.chen@alibaba-inc.com>
Date: Wed, 31 Aug 2022 16:28:31 +0800
Subject: [PATCH 029/175] [to #42322933] support get_cache_dir with model id

---
 modelscope/fileio/format/json.py | 3 ++-
 modelscope/hub/utils/utils.py    | 9 ++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/modelscope/fileio/format/json.py b/modelscope/fileio/format/json.py
index f615366f..9979c023 100644
--- a/modelscope/fileio/format/json.py
+++ b/modelscope/fileio/format/json.py
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import jsonplus
 import numpy as np
 
 from .base import FormatHandler
@@ -25,11 +24,13 @@ class JsonHandler(FormatHandler):
     """Use jsonplus, serialization of Python types to JSON that "just works"."""
 
     def load(self, file):
+        import jsonplus
         return jsonplus.loads(file.read())
 
     def dump(self, obj, file, **kwargs):
         file.write(self.dumps(obj, **kwargs))
 
     def dumps(self, obj, **kwargs):
+        import jsonplus
         kwargs.setdefault('default', set_default)
         return jsonplus.dumps(obj, **kwargs)
diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py
index 8faf8f1d..7e219d16 100644
--- a/modelscope/hub/utils/utils.py
+++ b/modelscope/hub/utils/utils.py
@@ -1,5 +1,6 @@
 import hashlib
 import os
+from typing import Optional
 
 from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
                                       DEFAULT_MODELSCOPE_DOMAIN,
@@ -23,14 +24,16 @@ def model_id_to_group_owner_name(model_id):
     return group_or_owner, name
 
 
-def get_cache_dir():
+def get_cache_dir(model_id: Optional[str] = None):
     """
     cache dir precedence:
         function parameter > enviroment > ~/.cache/modelscope/hub
     """
     default_cache_dir = get_default_cache_dir()
-    return os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir,
-                                                      'hub'))
+    base_path = os.getenv('MODELSCOPE_CACHE',
+                          os.path.join(default_cache_dir, 'hub'))
+    return base_path if model_id is None else os.path.join(
+        base_path, model_id + '/')
 
 
 def get_endpoint():

From a9deb3895c1f602ef85d43c89fbf6013c94bac5f Mon Sep 17 00:00:00 2001
From: "shuying.shu" <shuying.shu@alibaba-inc.com>
Date: Wed, 31 Aug 2022 20:54:20 +0800
Subject: [PATCH 030/175] =?UTF-8?q?[to=20#42322933]=20movie=20scene=20segm?=
 =?UTF-8?q?entation=E6=A8=A1=E5=9E=8B=E6=8E=A5=E5=85=A5=20=20=20=20=20=20?=
 =?UTF-8?q?=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib?=
 =?UTF-8?q?/codereview/9872869?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../movie_scene_segmentation_test_video.mp4   |   3 +
 modelscope/metainfo.py                        |   6 +
 modelscope/metrics/__init__.py                |   2 +
 modelscope/metrics/builder.py                 |   1 +
 .../movie_scene_segmentation_metric.py        |  52 +++
 modelscope/models/cv/__init__.py              |   5 +-
 .../cv/movie_scene_segmentation/__init__.py   |  25 ++
 .../cv/movie_scene_segmentation/get_model.py  |  45 +++
 .../cv/movie_scene_segmentation/model.py      | 192 ++++++++++
 .../utils/__init__.py                         |   3 +
 .../cv/movie_scene_segmentation/utils/head.py |  29 ++
 .../movie_scene_segmentation/utils/save_op.py | 118 +++++++
 .../utils/shot_encoder.py                     | 331 ++++++++++++++++++
 .../cv/movie_scene_segmentation/utils/trn.py  | 132 +++++++
 .../msdatasets/task_datasets/__init__.py      |   3 +
 .../movie_scene_segmentation/__init__.py      |   1 +
 .../movie_scene_segmentation_dataset.py       | 173 +++++++++
 .../movie_scene_segmentation/sampler.py       | 102 ++++++
 modelscope/outputs.py                         |  18 +
 modelscope/pipelines/builder.py               |   3 +
 modelscope/pipelines/cv/__init__.py           |   6 +-
 .../cv/movie_scene_segmentation_pipeline.py   |  67 ++++
 modelscope/preprocessors/__init__.py          |   4 +-
 .../movie_scene_segmentation/__init__.py      |  19 +
 .../movie_scene_segmentation/transforms.py    | 312 +++++++++++++++++
 modelscope/preprocessors/video.py             |  45 +++
 modelscope/trainers/__init__.py               |   5 +-
 modelscope/trainers/cv/__init__.py            |   2 +
 .../cv/movie_scene_segmentation_trainer.py    |  20 ++
 modelscope/utils/constant.py                  |   1 +
 requirements/cv.txt                           |   1 +
 tests/msdatasets/test_ms_dataset.py           |   6 +
 .../test_movie_scene_segmentation.py          |  36 ++
 .../test_movie_scene_segmentation_trainer.py  | 109 ++++++
 34 files changed, 1870 insertions(+), 7 deletions(-)
 create mode 100644 data/test/videos/movie_scene_segmentation_test_video.mp4
 create mode 100644 modelscope/metrics/movie_scene_segmentation_metric.py
 create mode 100644 modelscope/models/cv/movie_scene_segmentation/__init__.py
 create mode 100644 modelscope/models/cv/movie_scene_segmentation/get_model.py
 create mode 100644 modelscope/models/cv/movie_scene_segmentation/model.py
 create mode 100644 modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
 create mode 100644 modelscope/models/cv/movie_scene_segmentation/utils/head.py
 create mode 100644 modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
 create mode 100644 modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
 create mode 100644 modelscope/models/cv/movie_scene_segmentation/utils/trn.py
 create mode 100644 modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py
 create mode 100644 modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py
 create mode 100644 modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py
 create mode 100644 modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
 create mode 100644 modelscope/preprocessors/movie_scene_segmentation/__init__.py
 create mode 100644 modelscope/preprocessors/movie_scene_segmentation/transforms.py
 create mode 100644 modelscope/trainers/cv/movie_scene_segmentation_trainer.py
 create mode 100644 tests/pipelines/test_movie_scene_segmentation.py
 create mode 100644 tests/trainers/test_movie_scene_segmentation_trainer.py

diff --git a/data/test/videos/movie_scene_segmentation_test_video.mp4 b/data/test/videos/movie_scene_segmentation_test_video.mp4
new file mode 100644
index 00000000..ee6ed528
--- /dev/null
+++ b/data/test/videos/movie_scene_segmentation_test_video.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f
+size 126815483
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 908ee011..f1179be8 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -27,6 +27,7 @@ class Models(object):
     video_summarization = 'pgl-video-summarization'
     swinL_semantic_segmentation = 'swinL-semantic-segmentation'
     vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
+    resnet50_bert = 'resnet50-bert'
 
     # EasyCV models
     yolox = 'YOLOX'
@@ -133,6 +134,7 @@ class Pipelines(object):
     video_summarization = 'googlenet_pgl_video_summarization'
     image_semantic_segmentation = 'image-semantic-segmentation'
     image_reid_person = 'passvitb-image-reid-person'
+    movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
@@ -195,6 +197,7 @@ class Trainers(object):
     image_instance_segmentation = 'image-instance-segmentation'
     image_portrait_enhancement = 'image-portrait-enhancement'
     video_summarization = 'video-summarization'
+    movie_scene_segmentation = 'movie-scene-segmentation'
 
     # nlp trainers
     bert_sentiment_analysis = 'bert-sentiment-analysis'
@@ -223,6 +226,7 @@ class Preprocessors(object):
     image_instance_segmentation_preprocessor = 'image-instance-segmentation-preprocessor'
     image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor'
     video_summarization_preprocessor = 'video-summarization-preprocessor'
+    movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor'
 
     # nlp preprocessor
     sen_sim_tokenizer = 'sen-sim-tokenizer'
@@ -279,6 +283,8 @@ class Metrics(object):
     # metrics for image-portrait-enhancement task
     image_portrait_enhancement_metric = 'image-portrait-enhancement-metric'
     video_summarization_metric = 'video-summarization-metric'
+    # metric for movie-scene-segmentation task
+    movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'
 
 
 class Optimizers(object):
diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py
index c74b475e..d3975a2c 100644
--- a/modelscope/metrics/__init__.py
+++ b/modelscope/metrics/__init__.py
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
     from .text_generation_metric import TextGenerationMetric
     from .token_classification_metric import TokenClassificationMetric
     from .video_summarization_metric import VideoSummarizationMetric
+    from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric
 
 else:
     _import_structure = {
@@ -32,6 +33,7 @@ else:
         'text_generation_metric': ['TextGenerationMetric'],
         'token_classification_metric': ['TokenClassificationMetric'],
         'video_summarization_metric': ['VideoSummarizationMetric'],
+        'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'],
     }
 
     import sys
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py
index 869a1ab2..800e3508 100644
--- a/modelscope/metrics/builder.py
+++ b/modelscope/metrics/builder.py
@@ -34,6 +34,7 @@ task_default_metrics = {
     Tasks.video_summarization: [Metrics.video_summarization_metric],
     Tasks.image_captioning: [Metrics.text_gen_metric],
     Tasks.visual_question_answering: [Metrics.text_gen_metric],
+    Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
 }
 
 
diff --git a/modelscope/metrics/movie_scene_segmentation_metric.py b/modelscope/metrics/movie_scene_segmentation_metric.py
new file mode 100644
index 00000000..56bdbd1c
--- /dev/null
+++ b/modelscope/metrics/movie_scene_segmentation_metric.py
@@ -0,0 +1,52 @@
+from typing import Dict
+
+import numpy as np
+
+from modelscope.metainfo import Metrics
+from modelscope.utils.registry import default_group
+from modelscope.utils.tensor_utils import (torch_nested_detach,
+                                           torch_nested_numpify)
+from .base import Metric
+from .builder import METRICS, MetricKeys
+
+
+@METRICS.register_module(
+    group_key=default_group,
+    module_name=Metrics.movie_scene_segmentation_metric)
+class MovieSceneSegmentationMetric(Metric):
+    """Precision / recall / F1 (in percent) of binary scene-boundary labels.
+    """
+
+    def __init__(self):
+        self.preds = []
+        self.labels = []
+        self.eps = 1e-5  # keeps the ratios in evaluate() finite
+
+    def add(self, outputs: Dict, inputs: Dict):
+        """Collect one batch: `outputs['pred']` and `inputs['label']`."""
+        self.preds.extend(outputs['pred'])
+        self.labels.extend(inputs['label'])
+
+    def evaluate(self):
+        """Return a dict with F1, recall and precision over all added data."""
+        gts = np.array(torch_nested_numpify(torch_nested_detach(self.labels)))
+        prob = np.array(torch_nested_numpify(torch_nested_detach(self.preds)))
+
+        # positives are entries equal to 1, negatives entries equal to 0
+        gt_one = gts == 1
+        gt_zero = gts == 0
+        pred_one = prob == 1
+        pred_zero = prob == 0
+        tp = (gt_one * pred_one).sum()
+        fp = (gt_zero * pred_one).sum()
+        fn = (gt_one * pred_zero).sum()
+
+        precision = 100.0 * tp / (tp + fp + self.eps)
+        recall = 100.0 * tp / (tp + fn + self.eps)
+        # eps also guards F1: precision + recall is 0 when there is no TP
+        f1 = 2 * precision * recall / (precision + recall + self.eps)
+        return {
+            MetricKeys.F1: f1,
+            MetricKeys.RECALL: recall,
+            MetricKeys.PRECISION: precision
+        }
diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py
index 10040637..331f23bd 100644
--- a/modelscope/models/cv/__init__.py
+++ b/modelscope/models/cv/__init__.py
@@ -9,8 +9,9 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints,
                image_panoptic_segmentation, image_portrait_enhancement,
                image_reid_person, image_semantic_segmentation,
                image_to_image_generation, image_to_image_translation,
-               object_detection, product_retrieval_embedding,
-               realtime_object_detection, salient_detection, super_resolution,
+               movie_scene_segmentation, object_detection,
+               product_retrieval_embedding, realtime_object_detection,
+               salient_detection, super_resolution,
                video_single_object_tracking, video_summarization, virual_tryon)
 
 # yapf: enable
diff --git a/modelscope/models/cv/movie_scene_segmentation/__init__.py b/modelscope/models/cv/movie_scene_segmentation/__init__.py
new file mode 100644
index 00000000..25dcda96
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .model import MovieSceneSegmentationModel
+    from .datasets import MovieSceneSegmentationDataset
+
+else:
+    _import_structure = {
+        'model': ['MovieSceneSegmentationModel'],
+        'datasets': ['MovieSceneSegmentationDataset'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/movie_scene_segmentation/get_model.py b/modelscope/models/cv/movie_scene_segmentation/get_model.py
new file mode 100644
index 00000000..5c66fc02
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/get_model.py
@@ -0,0 +1,45 @@
+# ------------------------------------------------------------------------------------
+# BaSSL
+# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# Github: https://github.com/kakaobrain/bassl
+# ------------------------------------------------------------------------------------
+
+from .utils.shot_encoder import resnet50
+from .utils.trn import TransformerCRN
+
+
+def get_shot_encoder(cfg):
+    """Build the shot encoder selected by cfg['model']['shot_encoder']."""
+    encoder_cfg = cfg['model']['shot_encoder']
+    name = encoder_cfg['name']
+    shot_encoder_args = encoder_cfg[name]
+
+    # the only supported combination is a ResNet of depth 50
+    if name != 'resnet':
+        raise NotImplementedError
+    if shot_encoder_args['depth'] != 50:
+        raise NotImplementedError
+
+    return resnet50(**shot_encoder_args['params'], )
+
+
+def get_contextual_relation_network(cfg):
+    """Build the CRN from cfg, or return None when it is disabled."""
+    crn_cfg = cfg['model']['contextual_relation_network']
+    if not crn_cfg['enabled']:
+        return None
+
+    name = crn_cfg['name']
+    crn_args = crn_cfg['params'][name]
+    if name != 'trn':
+        raise NotImplementedError
+
+    # the sampling neighbor_size is doubled and written back into crn_args
+    sampling = cfg['model']['loss']['sampling_method']
+    per_side = sampling['params'][sampling['name']]['neighbor_size']
+    crn_args['neighbor_size'] = 2 * per_side
+    return TransformerCRN(crn_args)
+
+
+__all__ = ['get_shot_encoder', 'get_contextual_relation_network']
diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py
new file mode 100644
index 00000000..e9576963
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/model.py
@@ -0,0 +1,192 @@
+import os
+import os.path as osp
+from typing import Any, Dict
+
+import einops
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.transforms as TF
+from PIL import Image
+from shotdetect_scenedetect_lgss import shot_detect
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .get_model import get_contextual_relation_network, get_shot_encoder
+from .utils.save_op import get_pred_boundary, pred2scene, scene2video
+
+logger = get_logger()
+
+
+@MODELS.register_module(
+    Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert)
+class MovieSceneSegmentationModel(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Load weights and configuration from `model_dir` (model file root)."""
+        super().__init__(model_dir, *args, **kwargs)
+
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        params = torch.load(model_path, map_location='cpu')
+
+        config_path = osp.join(model_dir, ModelFile.CONFIGURATION)
+        self.cfg = Config.from_file(config_path)
+
+        def load_param_with_prefix(prefix, model, src_params):
+            """Fill `model` from checkpoint entries named '<prefix>.<name>'."""
+            own_state = model.state_dict()
+            for name, param in own_state.items():
+                src_name = prefix + '.' + name
+                own_state[name] = src_params[src_name]
+            model.load_state_dict(own_state)
+
+        self.shot_encoder = get_shot_encoder(self.cfg)
+        load_param_with_prefix('shot_encoder', self.shot_encoder, params)
+        self.crn = get_contextual_relation_network(self.cfg)
+        load_param_with_prefix('crn', self.crn, params)
+        # linear head: CRN hidden size -> 2 boundary logits
+        crn_name = self.cfg.model.contextual_relation_network.name
+        hdim = self.cfg.model.contextual_relation_network.params[crn_name][
+            'hidden_size']
+        self.head_sbd = nn.Linear(hdim, 2)
+        load_param_with_prefix('head_sbd', self.head_sbd, params)
+
+        self.test_transform = TF.Compose([
+            TF.Resize(size=256, interpolation=Image.BICUBIC),
+            TF.CenterCrop(224),
+            TF.ToTensor(),
+            TF.Normalize(
+                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+
+        self.infer_result = {'vid': [], 'sid': [], 'pred': []}
+        sampling_method = self.cfg.dataset.sampling_method.name
+        self.neighbor_size = self.cfg.dataset.sampling_method.params[
+            sampling_method].neighbor_size
+
+        self.eps = 1e-5
+
+    def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]:
+        """Score `inputs['video']` and compute a class-balanced loss.
+
+        Returns a dict with `pred` (argmax class per sample) and `loss`.
+        """
+        data = inputs['video']
+        labels = inputs['label']
+        outputs = self.shared_step(data)
+        per_sample = F.cross_entropy(
+            outputs.squeeze(), labels.squeeze(), reduction='none')
+
+        # both classes get equal total weight, each normalised by its count
+        lpos = labels == 1
+        lneg = labels == 0
+        pos_prior, neg_prior = 1, 1
+        total_prior = float(pos_prior + neg_prior)
+        wp = (pos_prior / total_prior) * lpos / (lpos.sum() + self.eps)
+        wn = (neg_prior / total_prior) * lneg / (lneg.sum() + self.eps)
+        loss = ((wp + wn) * per_sample).sum()
+        return dict(pred=torch.argmax(outputs, dim=1), loss=loss)
+
+    def inference(self, batch):
+        """Score every shot in `batch`, one mini-batch at a time."""
+        logger.info('Begin scene detect ......')
+        bs = self.cfg.pipeline.batch_size_per_gpu
+        sids = batch['sid']
+        inputs = batch['shot_feat']
+        shot_num = len(sids)
+
+        # Reset per call: 'pred' becomes an ndarray below and cannot be extended.
+        self.infer_result = {'vid': [], 'sid': [], 'pred': []}
+        # Step by `bs` so that no empty trailing slice reaches torch.stack
+        # when shot_num is an exact multiple of the batch size.
+        for start in range(0, shot_num, bs):
+            end = min(start + bs, shot_num)
+            input_ = torch.stack(inputs[start:end])
+            sid_ = sids[start:end]
+            outputs = self.shared_step(input_)  # shape [b,2]
+            prob = F.softmax(outputs, dim=1)
+            self.infer_result['sid'].extend(sid_.cpu().detach().numpy())
+            self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy())
+        self.infer_result['pred'] = np.stack(self.infer_result['pred'])
+        assert len(self.infer_result['sid']) == len(sids)
+        assert len(self.infer_result['pred']) == len(inputs)
+        return self.infer_result
+
+    def shared_step(self, inputs):
+        """Return boundary logits: shot encoder -> CRN -> linear head."""
+        with torch.no_grad():
+            # infer shot encoder
+            shot_repr = self.extract_shot_representation(inputs)
+            assert len(shot_repr.shape) == 3
+        # infer CRN
+        _, pooled = self.crn(shot_repr, mask=None)
+        # infer boundary score
+        pred = self.head_sbd(pooled)
+        return pred
+
+    def save_shot_feat(self, _repr):
+        """Write each row of `_repr` to <img_path>/features/shot_NNNN.npy."""
+        feat = _repr.detach().float().cpu().numpy()
+        pth = self.cfg.dataset.img_path + '/features'
+        # exist_ok: a second call must not fail on the existing directory
+        os.makedirs(pth, exist_ok=True)
+        for idx, shot_feat in enumerate(feat):
+            name = osp.join(pth, f'shot_{str(idx).zfill(4)}.npy')
+            np.save(name, shot_feat)
+
+    def extract_shot_representation(self,
+                                    inputs: torch.Tensor) -> torch.Tensor:
+        """ inputs [b s k c h w] -> output [b s d] (mean over the k keyframes) """
+        assert len(inputs.shape) == 6  # (B Shot Keyframe C H W)
+        b, s, k, c, h, w = inputs.shape
+        inputs = einops.rearrange(inputs, 'b s k c h w -> (b s) k c h w', s=s)
+        keyframe_repr = [self.shot_encoder(inputs[:, _k]) for _k in range(k)]
+        # [k (b s) d] -> [(b s) d]
+        shot_repr = torch.stack(keyframe_repr).mean(dim=0)
+        # split the merged axis back into batch and shot: [(b s) d] -> [b s d]
+        shot_repr = einops.rearrange(shot_repr, '(b s) d -> b s d', s=s)
+        return shot_repr
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs):
+        """Turn inference scores into scenes; return (scene count, scene dict)."""
+        logger.info('Generate scene .......')
+        pred_dict = inputs['feat']
+        thres = self.cfg.pipeline.save_threshold
+        # NOTE: self.shot2keyf is assigned in preprocess(); call that first.
+        anno_dict = get_pred_boundary(pred_dict, thres)
+        scene_dict, scene_list = pred2scene(self.shot2keyf, anno_dict)
+        if self.cfg.pipeline.save_split_scene:
+            re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
+            print(f'Split scene video saved to {re_dir}')
+        return len(scene_list), scene_dict
+
+    def preprocess(self, inputs):
+        """Detect shots and return (per-shot neighbor windows, shot ids)."""
+        logger.info('Begin shot detect......')
+        shot_keyf_lst, anno, shot2keyf = shot_detect(
+            inputs, **self.cfg.preprocessor.shot_detect)
+        logger.info('Shot detect done!')
+
+        single_shot_feat, sid = [], []
+        for idx, keyframes in enumerate(shot_keyf_lst):
+            frames = [self.test_transform(frame) for frame in keyframes]
+            single_shot_feat.append(torch.stack(frames, dim=0))
+            sid.append(idx)
+        single_shot_feat = torch.stack(single_shot_feat, dim=0)
+
+        offsets = np.arange(-self.neighbor_size, self.neighbor_size + 1)
+        shot_feat = []
+        for one_shot in anno:
+            # NOTE(review): upper clip bound assumes num_shot is the last valid
+            # shot index, not the shot count - confirm against shot_detect.
+            shot_idx = int(one_shot['shot_id']) + offsets
+            shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'])
+            shot_feat.append(single_shot_feat[shot_idx])
+        self.shot2keyf = shot2keyf
+        self.anno = anno
+        return shot_feat, sid
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
new file mode 100644
index 00000000..3682726f
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
@@ -0,0 +1,3 @@
+from .save_op import get_pred_boundary, pred2scene, scene2video
+from .shot_encoder import resnet50
+from .trn import TransformerCRN
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/head.py b/modelscope/models/cv/movie_scene_segmentation/utils/head.py
new file mode 100644
index 00000000..20a87e66
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/head.py
@@ -0,0 +1,29 @@
+# ------------------------------------------------------------------------------------
+# BaSSL
+# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# Github: https://github.com/kakaobrain/bassl
+# ------------------------------------------------------------------------------------
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class MlpHead(nn.Module):
+    """Two-layer MLP (Linear -> ReLU -> Linear), normalized on the last dim."""
+
+    def __init__(self, input_dim=2048, hidden_dim=2048, output_dim=128):
+        super().__init__()
+        self.output_dim = output_dim
+        self.input_dim = input_dim
+        self.hidden_dim = hidden_dim
+        self.model = nn.Sequential(
+            nn.Linear(self.input_dim, self.hidden_dim, bias=True),
+            nn.ReLU(),
+            nn.Linear(self.hidden_dim, self.output_dim, bias=True),
+        )
+
+    def forward(self, x):
+        # x shape: [b t d] where t means the number of views
+        x = self.model(x)
+        return F.normalize(x, dim=-1)
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
new file mode 100644
index 00000000..d7c8c0ed
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
@@ -0,0 +1,118 @@
+# ----------------------------------------------------------------------------------
+# The codes below partially refer to the SceneSeg LGSS.
+# Github: https://github.com/AnyiRao/SceneSeg
+# ----------------------------------------------------------------------------------
+import os
+import os.path as osp
+import subprocess
+
+import cv2
+import numpy as np
+from tqdm import tqdm
+
+
+def get_pred_boundary(pred_dict, threshold=0.5):
+    """Binarize `pred_dict['pred']` and key the flags by zero-padded shot id."""
+    flags = (pred_dict['pred'] > threshold).astype(np.int32)
+    return {
+        str(pred_dict['sid'][pos]).zfill(4): int(flag)
+        for pos, flag in enumerate(flags)
+    }
+
+
+def pred2scene(shot2keyf, anno_dict):
+    """Map each scene index to its shot ids and (start, end) frame pair."""
+    scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)
+    assert len(scene_list) == len(pair_list)
+
+    scene_dict = {
+        scene_ind: {
+            'shot': shots,
+            'frame': frames
+        }
+        for scene_ind, (frames, shots) in enumerate(zip(scene_list, pair_list))
+    }
+    return scene_dict, scene_list
+
+
+def scene2video(source_movie_fn, scene_list, thres):
+    """Cut `source_movie_fn` into one mp4 per scene with ffmpeg.
+
+    Clips go to <cwd>/pred_result/scene_video_<thres>/scene_NNNN.mp4; existing
+    clips are skipped. Returns <cwd>/pred_result.
+    """
+    vcap = cv2.VideoCapture(source_movie_fn)
+    fps = vcap.get(cv2.CAP_PROP_FPS)  # video.fps
+    vcap.release()  # only the frame rate is needed; free the handle now
+    out_video_dir_fn = os.path.join(os.getcwd(),
+                                    f'pred_result/scene_video_{thres}')
+    os.makedirs(out_video_dir_fn, exist_ok=True)
+
+    for scene_ind, scene_item in tqdm(enumerate(scene_list)):
+        out_video_fn = os.path.join(out_video_dir_fn,
+                                    f'scene_{str(scene_ind).zfill(4)}.mp4')
+        if os.path.exists(out_video_fn):
+            continue
+
+        # frame indices -> seconds
+        start_time = int(scene_item[0]) / fps
+        duration_time = int(scene_item[1]) / fps - start_time
+        subprocess.call([
+            'ffmpeg', '-v', 'quiet', '-y',
+            '-ss', str(start_time), '-t', str(duration_time),
+            '-i', source_movie_fn,
+            '-map_chapters', '-1', out_video_fn
+        ])
+    return osp.join(os.getcwd(), 'pred_result')
+
+
+def get_demo_scene_list(shot2keyf, anno_dict):
+    """Return (scene frame spans, scene shot-id groups) for the predictions."""
+    # get_pair_list yields None for empty input; treat that as "no scenes"
+    pair_list = get_pair_list(anno_dict) or []
+    scene_list = []
+    for pair in pair_list:
+        start_frame = shot2keyf[int(pair[0])].split(' ')[0]
+        end_frame = shot2keyf[int(pair[-1])].split(' ')[1]
+        scene_list.append((start_frame, end_frame))
+    return scene_list, pair_list
+
+
+def get_pair_list(anno_dict):
+    """Group sorted shot ids; a running label sum of 1 closes the group.
+
+    Returns a list of shot-id lists, or None when `anno_dict` is empty.
+    """
+    ordered_keys = sorted(anno_dict.keys())
+    boundary_acc = 0
+    current = []
+    groups = []
+    for key in ordered_keys:
+        boundary_acc += anno_dict.get(key)
+        current.append(key)
+        if boundary_acc == 1:
+            groups.append(current)
+            boundary_acc = 0
+            current = []
+        elif key == ordered_keys[-1] and len(current) > 0:
+            # trailing shots after the last boundary form the final group
+            groups.append(current)
+
+    if len(groups) == 0:
+        return None
+    while [] in groups:
+        groups.remove([])
+
+    # walk adjacent groups; a gap in shot ids flushes the pending run
+    pending = [groups[0]]
+    pair_list = []
+    for ind in range(len(groups) - 1):
+        nxt = groups[ind + 1]
+        cont_count = int(nxt[0]) - int(groups[ind][-1])
+        if cont_count > 1:
+            pair_list.extend(pending)
+            pending = [nxt]
+        else:
+            pending.append(nxt)
+    pair_list.extend(pending)
+    return pair_list
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
new file mode 100644
index 00000000..7ad1907f
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
@@ -0,0 +1,331 @@
+"""
+Modified from original implementation in torchvision
+"""
+
+from typing import Any, Callable, List, Optional, Type, Union
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+def conv3x3(in_planes: int,
+            out_planes: int,
+            stride: int = 1,
+            groups: int = 1,
+            dilation: int = 1) -> nn.Conv2d:
+    """3x3 convolution with padding"""
+    return nn.Conv2d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=dilation,
+        groups=groups,
+        bias=False,
+        dilation=dilation,
+    )
+
+
+def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
+    """1x1 convolution"""
+    return nn.Conv2d(
+        in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion: int = 1
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super(BasicBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        if groups != 1 or base_width != 64:
+            raise ValueError(
+                'BasicBlock only supports groups=1 and base_width=64')
+        if dilation > 1:
+            raise NotImplementedError(
+                'Dilation > 1 not supported in BasicBlock')
+        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = norm_layer(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = norm_layer(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
+    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
+    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
+    # This variant is also known as ResNet V1.5 and improves accuracy according to
+    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+
+    expansion: int = 4
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super(Bottleneck, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.0)) * groups
+        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv1x1(inplanes, width)
+        self.bn1 = norm_layer(width)
+        self.conv2 = conv3x3(width, width, stride, groups, dilation)
+        self.bn2 = norm_layer(width)
+        self.conv3 = conv1x1(width, planes * self.expansion)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+
+    def __init__(
+        self,
+        block: Type[Union[BasicBlock, Bottleneck]],
+        layers: List[int],
+        in_channel_dim: int = 3,
+        zero_init_residual: bool = False,
+        use_last_block_grid: bool = False,
+        groups: int = 1,
+        width_per_group: int = 64,
+        replace_stride_with_dilation: Optional[List[bool]] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super(ResNet, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        self._norm_layer = norm_layer
+
+        self.use_last_block_grid = use_last_block_grid
+        self.inplanes = 64
+        self.dilation = 1
+        if replace_stride_with_dilation is None:
+            # each element in the tuple indicates if we should replace
+            # the 2x2 stride with a dilated convolution instead
+            replace_stride_with_dilation = [False, False, False]
+        if len(replace_stride_with_dilation) != 3:
+            raise ValueError('replace_stride_with_dilation should be None '
+                             'or a 3-element tuple, got {}'.format(
+                                 replace_stride_with_dilation))
+        self.groups = groups
+        self.base_width = width_per_group
+        self.conv1 = nn.Conv2d(
+            in_channel_dim,
+            self.inplanes,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias=False,
+        )
+        self.bn1 = norm_layer(self.inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(
+            block,
+            128,
+            layers[1],
+            stride=2,
+            dilate=replace_stride_with_dilation[0])
+        self.layer3 = self._make_layer(
+            block,
+            256,
+            layers[2],
+            stride=2,
+            dilate=replace_stride_with_dilation[1])
+        self.layer4 = self._make_layer(
+            block,
+            512,
+            layers[3],
+            stride=2,
+            dilate=replace_stride_with_dilation[2])
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+        # Zero-initialize the last BN in each residual branch,
+        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
+        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
+        if zero_init_residual:
+            for m in self.modules():
+                if isinstance(m, Bottleneck):
+                    nn.init.constant_(m.bn3.weight,
+                                      0)  # type: ignore[arg-type]
+                elif isinstance(m, BasicBlock):
+                    nn.init.constant_(m.bn2.weight,
+                                      0)  # type: ignore[arg-type]
+
+    def _make_layer(
+        self,
+        block: Type[Union[BasicBlock, Bottleneck]],
+        planes: int,
+        blocks: int,
+        stride: int = 1,
+        dilate: bool = False,
+    ) -> nn.Sequential:
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                conv1x1(self.inplanes, planes * block.expansion, stride),
+                norm_layer(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(
+            block(
+                self.inplanes,
+                planes,
+                stride,
+                downsample,
+                self.groups,
+                self.base_width,
+                previous_dilation,
+                norm_layer,
+            ))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(
+                block(
+                    self.inplanes,
+                    planes,
+                    groups=self.groups,
+                    base_width=self.base_width,
+                    dilation=self.dilation,
+                    norm_layer=norm_layer,
+                ))
+
+        return nn.Sequential(*layers)
+
+    def _forward_impl(self, x: Tensor, grid: bool, level: List, both: bool,
+                      grid_only: bool) -> Tensor:
+        """Run the backbone; optionally collect layer3/layer4 feature maps.
+
+        With `grid` set, detached copies of the block outputs named in `level`
+        (3 and/or 4) are gathered in a list. Depending on `both`/`grid_only`
+        the pooled vector, the list, or both are returned.
+        """
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+
+        # Defined unconditionally and only filled when `grid` is requested, so
+        # grid=False with a non-empty `level` cannot hit an unbound name.
+        x_grid = []
+        if grid and 3 in level:
+            x_grid.append(x.detach().clone())
+            if not both and len(level) == 1:
+                return x_grid
+
+        x = self.layer4(x)
+        if grid and 4 in level:
+            x_grid.append(x.detach().clone())
+            if not both and len(level) == 1:
+                return x_grid
+
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+        if not grid or len(level) == 0:
+            return x
+        if grid_only:
+            return x_grid
+        if both:
+            return x, x_grid
+        return x
+
+    def forward(
+        self,
+        x: Tensor,
+        grid: bool = False,
+        level: List = [],
+        both: bool = False,
+        grid_only: bool = False,
+    ) -> Tensor:
+        return self._forward_impl(x, grid, level, both, grid_only)
+
+
+def resnet50(**kwargs: Any) -> ResNet:
+    r"""ResNet-50 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
+    """
+    return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/trn.py b/modelscope/models/cv/movie_scene_segmentation/utils/trn.py
new file mode 100644
index 00000000..769e9ee4
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/trn.py
@@ -0,0 +1,132 @@
+# ------------------------------------------------------------------------------------
+# BaSSL
+# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# Github: https://github.com/kakaobrain/bassl
+# ------------------------------------------------------------------------------------
+
+import torch
+import torch.nn as nn
+from transformers.models.bert.modeling_bert import BertEncoder
+
+
+class ShotEmbedding(nn.Module):
+    """Embed a shot sequence: mean [CLS] token, projection, positions."""
+
+    def __init__(self, cfg):
+        super().__init__()
+        nn_size = cfg.neighbor_size + 2  # +1 for center shot, +1 for cls
+        self.shot_embedding = nn.Linear(cfg.input_dim, cfg.hidden_size)
+        self.position_embedding = nn.Embedding(nn_size, cfg.hidden_size)
+        self.mask_embedding = nn.Embedding(2, cfg.input_dim, padding_idx=0)
+
+        # tf naming convention for layer norm
+        self.LayerNorm = nn.LayerNorm(cfg.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
+
+        self.register_buffer('pos_ids',
+                             torch.arange(nn_size, dtype=torch.long))
+
+    def forward(
+        self,
+        shot_emb: torch.Tensor,
+        mask: torch.Tensor = None,
+        pos_ids: torch.Tensor = None,
+    ) -> torch.Tensor:
+        assert len(shot_emb.size()) == 3
+
+        if pos_ids is None:
+            pos_ids = self.pos_ids
+
+        # masked shots are swapped for the mask embedding, the rest are kept;
+        # cast before `1 - mask` because torch rejects `-` on bool tensors
+        if mask is not None:
+            self.mask_embedding.weight.data[0, :].fill_(0)
+            mask_emb = self.mask_embedding(mask.long())
+            shot_emb = (shot_emb * (1 - mask.float())[:, :, None]) + mask_emb
+
+        # we set [CLS] token to averaged feature
+        cls_emb = shot_emb.mean(dim=1)
+
+        # embedding shots
+        shot_emb = torch.cat([cls_emb[:, None, :], shot_emb], dim=1)
+        shot_emb = self.shot_embedding(shot_emb)
+        pos_emb = self.position_embedding(pos_ids)
+        embeddings = shot_emb + pos_emb[None, :]
+        embeddings = self.dropout(self.LayerNorm(embeddings))
+        return embeddings
+
+
+class TransformerCRN(nn.Module):
+
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.pooling_method = cfg.pooling_method
+        self.shot_embedding = ShotEmbedding(cfg)
+        self.encoder = BertEncoder(cfg)
+
+        nn_size = cfg.neighbor_size + 2  # +1 for center shot, +1 for cls
+        self.register_buffer(
+            'attention_mask',
+            self._get_extended_attention_mask(
+                torch.ones((1, nn_size)).float()),
+        )
+
+    def forward(
+        self,
+        shot: torch.Tensor,
+        mask: torch.Tensor = None,
+        pos_ids: torch.Tensor = None,
+        pooling_method: str = None,
+    ):
+        if self.attention_mask.shape[1] != (shot.shape[1] + 1):
+            n_shot = shot.shape[1] + 1  # +1 for CLS token
+            attention_mask = self._get_extended_attention_mask(
+                torch.ones((1, n_shot), dtype=torch.float, device=shot.device))
+        else:
+            attention_mask = self.attention_mask
+
+        shot_emb = self.shot_embedding(shot, mask=mask, pos_ids=pos_ids)
+        encoded_emb = self.encoder(
+            shot_emb, attention_mask=attention_mask).last_hidden_state
+
+        return encoded_emb, self.pooler(
+            encoded_emb, pooling_method=pooling_method)
+
+    def pooler(self, sequence_output, pooling_method=None):
+        if pooling_method is None:
+            pooling_method = self.pooling_method
+
+        if pooling_method == 'cls':
+            return sequence_output[:, 0, :]
+        elif pooling_method == 'avg':
+            return sequence_output[:, 1:].mean(dim=1)
+        elif pooling_method == 'max':
+            return sequence_output[:, 1:].max(dim=1)[0]
+        elif pooling_method == 'center':
+            cidx = sequence_output.shape[1] // 2
+            return sequence_output[:, cidx, :]
+        else:
+            raise ValueError
+
+    def _get_extended_attention_mask(self, attention_mask):
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.dim() == 2:
+            extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError(
+                f'Wrong shape for attention_mask (shape {attention_mask.shape})'
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        return extended_attention_mask
diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py
index 1905bf39..f97ff8b2 100644
--- a/modelscope/msdatasets/task_datasets/__init__.py
+++ b/modelscope/msdatasets/task_datasets/__init__.py
@@ -9,7 +9,9 @@ if TYPE_CHECKING:
     from .torch_base_dataset import TorchTaskDataset
     from .veco_dataset import VecoDataset
     from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset
+    from .movie_scene_segmentation import MovieSceneSegmentationDataset
     from .video_summarization_dataset import VideoSummarizationDataset
+
 else:
     _import_structure = {
         'base': ['TaskDataset'],
@@ -19,6 +21,7 @@ else:
         'image_instance_segmentation_coco_dataset':
         ['ImageInstanceSegmentationCocoDataset'],
         'video_summarization_dataset': ['VideoSummarizationDataset'],
+        'movie_scene_segmentation': ['MovieSceneSegmentationDataset'],
     }
     import sys
 
diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py
new file mode 100644
index 00000000..e56039ac
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py
@@ -0,0 +1 @@
+from .movie_scene_segmentation_dataset import MovieSceneSegmentationDataset
diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py
new file mode 100644
index 00000000..925d6281
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py
@@ -0,0 +1,173 @@
+# ---------------------------------------------------------------------------------------------------
+# The implementation is built upon BaSSL, publicly available at https://github.com/kakaobrain/bassl
+# ---------------------------------------------------------------------------------------------------
+import copy
+import os
+import os.path as osp
+import random
+
+import json
+import torch
+from torchvision.datasets.folder import pil_loader
+
+from modelscope.metainfo import Models
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.msdatasets.task_datasets.torch_base_dataset import \
+    TorchTaskDataset
+from modelscope.utils.constant import Tasks
+from . import sampler
+
+DATASET_STRUCTURE = {
+    'train': {
+        'annotation': 'anno/train.json',
+        'images': 'keyf_240p',
+        'feat': 'feat'
+    },
+    'test': {
+        'annotation': 'anno/test.json',
+        'images': 'keyf_240p',
+        'feat': 'feat'
+    }
+}
+
+
+@TASK_DATASETS.register_module(
+    Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert)
+class MovieSceneSegmentationDataset(TorchTaskDataset):
+    """dataset for movie scene segmentation.
+
+    Args:
+        split_config (dict): Annotation file path. {"train":"xxxxx"}
+        data_root (str, optional): Data root for ``ann_file``,
+            ``img_prefix``, ``seg_prefix``, ``proposal_file`` if specified.
+        test_mode (bool, optional): If set True, annotation will not be loaded.
+    """
+
+    def __init__(self, **kwargs):
+        split_config = kwargs['split_config']
+
+        self.data_root = next(iter(split_config.values()))
+        if not osp.exists(self.data_root):
+            self.data_root = osp.dirname(self.data_root)
+            assert osp.exists(self.data_root)
+
+        self.split = next(iter(split_config.keys()))
+        self.preprocessor = kwargs['preprocessor']
+
+        self.ann_file = osp.join(self.data_root,
+                                 DATASET_STRUCTURE[self.split]['annotation'])
+        self.img_prefix = osp.join(self.data_root,
+                                   DATASET_STRUCTURE[self.split]['images'])
+        self.feat_prefix = osp.join(self.data_root,
+                                    DATASET_STRUCTURE[self.split]['feat'])
+
+        self.test_mode = kwargs['test_mode']
+        if self.test_mode:
+            self.preprocessor.eval()
+        else:
+            self.preprocessor.train()
+
+        self.cfg = kwargs.pop('cfg', None)
+
+        self.num_keyframe = self.cfg.num_keyframe if self.cfg is not None else 3
+        self.use_single_keyframe = self.cfg.use_single_keyframe if self.cfg is not None else False
+
+        self.load_data()
+        self.init_sampler(self.cfg)
+
+    def __len__(self):
+        """Total number of samples of data."""
+        return len(self.anno_data)
+
+    def __getitem__(self, idx: int):
+        data = self.anno_data[
+            idx]  # {"video_id", "shot_id", "num_shot", "boundary_label"}
+        vid, sid = data['video_id'], data['shot_id']
+        num_shot = data['num_shot']
+
+        shot_idx = self.shot_sampler(int(sid), num_shot)
+
+        video = self.load_shot_list(vid, shot_idx)
+        if self.preprocessor is None:
+            video = torch.stack(video, dim=0)
+            video = video.view(-1, self.num_keyframe, 3, 224, 224)
+        else:
+            video = self.preprocessor(video)
+
+        payload = {
+            'idx': idx,
+            'vid': vid,
+            'sid': sid,
+            'video': video,
+            'label': abs(data['boundary_label']),  # ignore -1 label.
+        }
+        return payload
+
+    def load_data(self):
+        self.tmpl = '{}/shot_{}_img_{}.jpg'  # video_id, shot_id, shot_num
+
+        if not self.test_mode:
+            with open(self.ann_file) as f:
+                self.anno_data = json.load(f)
+            self.vidsid2label = {
+                f"{it['video_id']}_{it['shot_id']}": it['boundary_label']
+                for it in self.anno_data
+            }
+        else:
+            with open(self.ann_file) as f:
+                self.anno_data = json.load(f)
+
+    def init_sampler(self, cfg):
+        # shot sampler
+        if cfg is not None:
+            self.sampling_method = cfg.sampling_method.name
+            sampler_args = copy.deepcopy(
+                cfg.sampling_method.params.get(self.sampling_method, {}))
+            if self.sampling_method == 'instance':
+                self.shot_sampler = sampler.InstanceShotSampler()
+            elif self.sampling_method == 'temporal':
+                self.shot_sampler = sampler.TemporalShotSampler(**sampler_args)
+            elif self.sampling_method == 'shotcol':
+                self.shot_sampler = sampler.SequenceShotSampler(**sampler_args)
+            elif self.sampling_method == 'bassl':
+                self.shot_sampler = sampler.SequenceShotSampler(**sampler_args)
+            elif self.sampling_method == 'bassl+shotcol':
+                self.shot_sampler = sampler.SequenceShotSampler(**sampler_args)
+            elif self.sampling_method == 'sbd':
+                self.shot_sampler = sampler.NeighborShotSampler(**sampler_args)
+            else:
+                raise NotImplementedError
+        else:
+            self.shot_sampler = sampler.NeighborShotSampler()
+
+    def load_shot_list(self, vid, shot_idx):
+        shot_list = []
+        cache = {}
+        for sidx in shot_idx:
+            vidsid = f'{vid}_{sidx:04d}'
+            if vidsid in cache:
+                shot = cache[vidsid]
+            else:
+                shot_path = os.path.join(
+                    self.img_prefix, self.tmpl.format(vid, f'{sidx:04d}',
+                                                      '{}'))
+                shot = self.load_shot_keyframes(shot_path)
+                cache[vidsid] = shot
+            shot_list.extend(shot)
+        return shot_list
+
+    def load_shot_keyframes(self, path):
+        shot = None
+        if not self.test_mode and self.use_single_keyframe:
+            # load one randomly sampled keyframe
+            shot = [
+                pil_loader(
+                    path.format(random.randint(0, self.num_keyframe - 1)))
+            ]
+        else:
+            # load all keyframes
+            shot = [
+                pil_loader(path.format(i)) for i in range(self.num_keyframe)
+            ]
+        assert shot is not None
+        return shot
diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py
new file mode 100644
index 00000000..0fc2fe0f
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py
@@ -0,0 +1,102 @@
+# ------------------------------------------------------------------------------------
+# BaSSL
+# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# Github: https://github.com/kakaobrain/bassl
+# ------------------------------------------------------------------------------------
+
+import random
+
+import numpy as np
+
+
+class InstanceShotSampler:
+    """ This is for instance at pre-training stage """
+
+    def __call__(self, center_sid: int, *args, **kwargs):
+        return center_sid
+
+
+class TemporalShotSampler:
+    """ This is for temporal at pre-training stage """
+
+    def __init__(self, neighbor_size: int):
+        self.N = neighbor_size
+
+    def __call__(self, center_sid: int, total_num_shot: int):
+        """ we randomly sample one shot from neighbor shots within local temporal window
+        """
+        shot_idx = center_sid + np.arange(
+            -self.N, self.N + 1
+        )  # total number of neighbor shots = 2N+1 (query (1) + neighbors (2*N))
+        shot_idx = np.clip(shot_idx, 0,
+                           total_num_shot)  # deal with out-of-boundary indices
+        shot_idx = random.choice(
+            np.unique(np.delete(shot_idx, np.where(shot_idx == center_sid))))
+        return shot_idx
+
+
+class SequenceShotSampler:
+    """ This is for bassl or shotcol at pre-training stage """
+
+    def __init__(self, neighbor_size: int, neighbor_interval: int):
+        self.interval = neighbor_interval
+        self.window_size = neighbor_size * self.interval  # temporal coverage
+
+    def __call__(self,
+                 center_sid: int,
+                 total_num_shot: int,
+                 sparse_method: str = 'edge'):
+        """
+        Args:
+            center_sid: index of center shot
+            total_num_shot: last index of shot for given video
+            sparse_method: 'edge' or 'edge+center'; picks which positions of
+                    the dense sequence make up the sparse sequence
+        """
+
+        dense_shot_idx = center_sid + np.arange(
+            -self.window_size, self.window_size + 1,
+            self.interval)  # total number of shots = 2*neighbor_size+1
+
+        if dense_shot_idx[0] < 0:
+            # if center_sid is near left-side of video, we shift window rightward
+            # so that the leftmost index is 0
+            dense_shot_idx -= dense_shot_idx[0]
+        elif dense_shot_idx[-1] > (total_num_shot - 1):
+            # if center_sid is near right-side of video, we shift window leftward
+            # so that the rightmost index is total_num_shot - 1
+            dense_shot_idx -= dense_shot_idx[-1] - (total_num_shot - 1)
+
+        # to deal with videos that have smaller number of shots than window size
+        dense_shot_idx = np.clip(dense_shot_idx, 0, total_num_shot)
+
+        last = len(dense_shot_idx) - 1
+        if sparse_method == 'edge':
+            # the two edge shots form the sparse sequence (no zero-step arange)
+            sparse_idx_to_dense = np.array([0, last])
+        elif sparse_method == 'edge+center':
+            # the two edge shots followed by the center shot
+            sparse_idx_to_dense = np.array([0, last, len(dense_shot_idx) // 2])
+        else:
+            # used to fall through and fail later on an unbound local
+            raise ValueError(f'Unknown sparse_method: {sparse_method!r}')
+
+        shot_idx = [sparse_idx_to_dense, dense_shot_idx]
+        return shot_idx
+
+
+class NeighborShotSampler:
+    """ This is for scene boundary detection (sbd), i.e., fine-tuning stage """
+
+    def __init__(self, neighbor_size: int = 8):
+        self.neighbor_size = neighbor_size
+
+    def __call__(self, center_sid: int, total_num_shot: int):
+        # total number of shots = 2 * neighbor_size + 1
+        shot_idx = center_sid + np.arange(-self.neighbor_size,
+                                          self.neighbor_size + 1)
+        shot_idx = np.clip(shot_idx, 0,
+                           total_num_shot)  # for out-of-boundary indices
+
+        return shot_idx
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 1c42a5f3..7c0e08dc 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -35,6 +35,8 @@ class OutputKeys(object):
     UUID = 'uuid'
     WORD = 'word'
     KWS_LIST = 'kws_list'
+    SPLIT_VIDEO_NUM = 'split_video_num'
+    SPLIT_META_DICT = 'split_meta_dict'
 
 
 TASK_OUTPUTS = {
@@ -241,6 +243,22 @@ TASK_OUTPUTS = {
     # }
     Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG],
 
+    # movie scene segmentation result for a single video
+    # {
+    #        "split_video_num":3,
+    #        "split_meta_dict":
+    #        {
+    #           scene_id:
+    #           {
+    #               "shot": [0,1,2],
+    #               "frame": [start_frame, end_frame]
+    #           }
+    #        }
+    #
+    # }
+    Tasks.movie_scene_segmentation:
+    [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_DICT],
+
     # ============ nlp tasks ===================
 
     # text classification result for single sample
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 53f55b06..943578fb 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -144,6 +144,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/cv_vitb_video-single-object-tracking_ostrack'),
     Tasks.image_reid_person: (Pipelines.image_reid_person,
                               'damo/cv_passvitb_image-reid-person_market'),
+    Tasks.movie_scene_segmentation:
+    (Pipelines.movie_scene_segmentation,
+     'damo/cv_resnet50-bert_video-scene-segmentation_movienet')
 }
 
 
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 640ffd4c..bd175578 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -42,6 +42,8 @@ if TYPE_CHECKING:
     from .video_category_pipeline import VideoCategoryPipeline
     from .virtual_try_on_pipeline import VirtualTryonPipeline
     from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline
+    from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
+
 else:
     _import_structure = {
         'action_recognition_pipeline': ['ActionRecognitionPipeline'],
@@ -90,7 +92,9 @@ else:
         'video_category_pipeline': ['VideoCategoryPipeline'],
         'virtual_try_on_pipeline': ['VirtualTryonPipeline'],
         'easycv_pipeline':
-        ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline']
+        ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'],
+        'movie_scene_segmentation_pipeline':
+        ['MovieSceneSegmentationPipeline'],
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
new file mode 100644
index 00000000..0ef0261d
--- /dev/null
+++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
@@ -0,0 +1,67 @@
+from typing import Any, Dict
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.movie_scene_segmentation,
+    module_name=Pipelines.movie_scene_segmentation)
+class MovieSceneSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """use `model` to create a movie scene segmentation pipeline for prediction
+
+        Args:
+            model: model id on modelscope hub
+        """
+        _device = kwargs.pop('device', 'gpu')
+        if torch.cuda.is_available() and _device == 'gpu':
+            device = 'gpu'
+        else:
+            device = 'cpu'
+        super().__init__(model=model, device=device, **kwargs)
+
+        logger.info('Load model done!')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """ use pyscenedetect to detect shot from the input video, and generate key-frame jpg, anno.ndjson, and shot-frame.txt
+            Then use shot-encoder to encoder feat of the detected key-frame
+
+        Args:
+            input: path of the input video
+
+        """
+        self.input_video_pth = input
+        if isinstance(input, str):
+            shot_feat, sid = self.model.preprocess(input)
+        else:
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+
+        result = {'sid': sid, 'shot_feat': shot_feat}
+
+        return result
+
+    def forward(self, input: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            output = self.model.inference(input)
+        return output
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        data = {'input_video_pth': self.input_video_pth, 'feat': inputs}
+        video_num, meta_dict = self.model.postprocess(data)
+        result = {
+            OutputKeys.SPLIT_VIDEO_NUM: video_num,
+            OutputKeys.SPLIT_META_DICT: meta_dict
+        }
+        return result
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index f5ac0e4e..d365b6fa 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -27,7 +27,7 @@ if TYPE_CHECKING:
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
                         DialogStateTrackingPreprocessor)
-    from .video import ReadVideoData
+    from .video import ReadVideoData, MovieSceneSegmentationPreprocessor
     from .star import ConversationalTextToSqlPreprocessor
 
 else:
@@ -37,7 +37,7 @@ else:
         'common': ['Compose', 'ToTensor', 'Filter'],
         'audio': ['LinearAECAndFbank'],
         'asr': ['WavToScp'],
-        'video': ['ReadVideoData'],
+        'video': ['ReadVideoData', 'MovieSceneSegmentationPreprocessor'],
         'image': [
             'LoadImage', 'load_image', 'ImageColorEnhanceFinetunePreprocessor',
             'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor'
diff --git a/modelscope/preprocessors/movie_scene_segmentation/__init__.py b/modelscope/preprocessors/movie_scene_segmentation/__init__.py
new file mode 100644
index 00000000..73da792d
--- /dev/null
+++ b/modelscope/preprocessors/movie_scene_segmentation/__init__.py
@@ -0,0 +1,19 @@
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .transforms import get_transform
+else:
+    _import_structure = {
+        'transforms': ['get_transform'],
+    }
+
+    import sys
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/preprocessors/movie_scene_segmentation/transforms.py b/modelscope/preprocessors/movie_scene_segmentation/transforms.py
new file mode 100644
index 00000000..b4e57420
--- /dev/null
+++ b/modelscope/preprocessors/movie_scene_segmentation/transforms.py
@@ -0,0 +1,312 @@
+# ------------------------------------------------------------------------------------
+# The codes below partially refer to the BaSSL
+# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# Github: https://github.com/kakaobrain/bassl
+# ------------------------------------------------------------------------------------
+import numbers
+import os.path as osp
+import random
+from typing import List
+
+import numpy as np
+import torch
+import torchvision.transforms as TF
+import torchvision.transforms.functional as F
+from PIL import Image, ImageFilter
+
+
+def get_transform(lst):
+    """Compose the transforms described by a non-empty list of configs.
+
+    Each entry is handed to `build_transform`; list order is preserved.
+    """
+    assert len(lst) > 0
+    return TF.Compose([build_transform(item) for item in lst])
+
+
+def build_transform(cfg):
+    """Build the video transform named by ``cfg['type']`` from its config."""
+    assert isinstance(cfg, dict)
+    params = cfg.copy()  # pop from a copy so the caller's dict is untouched
+    name = params.pop('type')  # was `type`, which shadowed the builtin
+    if name == 'VideoResizedCenterCrop':
+        return VideoResizedCenterCrop(**params)
+    elif name == 'VideoToTensor':
+        return VideoToTensor(**params)
+    elif name == 'VideoRandomResizedCrop':
+        return VideoRandomResizedCrop(**params)
+    elif name == 'VideoRandomHFlip':
+        return VideoRandomHFlip()
+    elif name == 'VideoRandomColorJitter':
+        return VideoRandomColorJitter(**params)
+    elif name == 'VideoRandomGaussianBlur':
+        return VideoRandomGaussianBlur(**params)
+    else:
+        raise NotImplementedError(f'Unknown transform type: {name!r}')
+
+
+class VideoResizedCenterCrop(torch.nn.Module):
+    """Resize, then center-crop, every frame of a list of images."""
+
+    def __init__(self, image_size, crop_size):
+        super().__init__()  # set up Module state (repr/state_dict need it)
+        resize = TF.Resize(size=image_size, interpolation=Image.BICUBIC)
+        self.tfm = TF.Compose([resize, TF.CenterCrop(crop_size)])
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        return [self.tfm(img) for img in imgmap]
+
+
+class VideoToTensor(torch.nn.Module):
+    """Convert each frame to a tensor and normalize it with mean/std."""
+
+    def __init__(self, mean=None, std=None, inplace=False):
+        super().__init__()  # set up Module state (repr/state_dict need it)
+        assert mean is not None and std is not None
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def __to_tensor__(self, img):
+        return F.to_tensor(img)
+
+    def __normalize__(self, img):
+        return F.normalize(img, self.mean, self.std, self.inplace)
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        return [self.__normalize__(self.__to_tensor__(img)) for img in imgmap]
+
+
+class VideoRandomResizedCrop(torch.nn.Module):
+
+    def __init__(self, size, bottom_area=0.2):
+        self.p = 1.0
+        self.interpolation = Image.BICUBIC
+        self.size = size
+        self.bottom_area = bottom_area
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        if random.random() < self.p:  # do RandomResizedCrop, consistent=True
+            top, left, height, width = TF.RandomResizedCrop.get_params(
+                imgmap[0],
+                scale=(self.bottom_area, 1.0),
+                ratio=(3 / 4.0, 4 / 3.0))
+            return [
+                F.resized_crop(
+                    img=img,
+                    top=top,
+                    left=left,
+                    height=height,
+                    width=width,
+                    size=(self.size, self.size),
+                ) for img in imgmap
+            ]
+        else:
+            return [
+                F.resize(img=img, size=[self.size, self.size])
+                for img in imgmap
+            ]
+
+
+class VideoRandomHFlip(torch.nn.Module):
+
+    def __init__(self, consistent=True, command=None, seq_len=0):
+        self.consistent = consistent
+        if seq_len != 0:
+            self.consistent = False
+        if command == 'left':
+            self.threshold = 0
+        elif command == 'right':
+            self.threshold = 1
+        else:
+            self.threshold = 0.5
+        self.seq_len = seq_len
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        if self.consistent:
+            if random.random() < self.threshold:
+                return [i.transpose(Image.FLIP_LEFT_RIGHT) for i in imgmap]
+            else:
+                return imgmap
+        else:
+            result = []
+            for idx, i in enumerate(imgmap):
+                if idx % self.seq_len == 0:
+                    th = random.random()
+                if th < self.threshold:
+                    result.append(i.transpose(Image.FLIP_LEFT_RIGHT))
+                else:
+                    result.append(i)
+            assert len(result) == len(imgmap)
+            return result
+
+
+class VideoRandomColorJitter(torch.nn.Module):
+    """Randomly change the brightness, contrast and saturation of an image.
+    Args:
+        brightness (float or tuple of float (min, max)): How much to jitter brightness.
+            brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
+            or the given [min, max]. Should be non negative numbers.
+        contrast (float or tuple of float (min, max)): How much to jitter contrast.
+            contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
+            or the given [min, max]. Should be non negative numbers.
+        saturation (float or tuple of float (min, max)): How much to jitter saturation.
+            saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
+            or the given [min, max]. Should be non negative numbers.
+        hue (float or tuple of float (min, max)): How much to jitter hue.
+            hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
+            Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
+    """
+
+    def __init__(
+        self,
+        brightness=0,
+        contrast=0,
+        saturation=0,
+        hue=0,
+        consistent=True,
+        p=1.0,
+        seq_len=0,
+    ):
+        self.brightness = self._check_input(brightness, 'brightness')
+        self.contrast = self._check_input(contrast, 'contrast')
+        self.saturation = self._check_input(saturation, 'saturation')
+        self.hue = self._check_input(
+            hue, 'hue', center=0, bound=(-0.5, 0.5), clip_first_on_zero=False)
+        self.consistent = consistent
+        self.threshold = p
+        self.seq_len = seq_len
+
+    def _check_input(self, value, name, center=1,
+                     bound=(0, float('inf')), clip_first_on_zero=True):
+        """Normalise a jitter argument to a [min, max] pair, or None.
+
+        A single non-negative number ``v`` becomes ``[center - v, center + v]``
+        (lower end clipped at 0 when ``clip_first_on_zero``); a 2-item
+        list/tuple is taken as-is. Either form must lie inside ``bound``.
+
+        Returns None when the range collapses to ``center`` (no jitter).
+        Raises ValueError for out-of-range values, TypeError for other inputs.
+        """
+        if isinstance(value, numbers.Number):
+            if value < 0:
+                raise ValueError(
+                    'If {} is a single number, it must be non negative.'.
+                    format(name))
+            value = [center - value, center + value]
+            if clip_first_on_zero:
+                value[0] = max(value[0], 0)
+        elif not (isinstance(value, (tuple, list)) and len(value) == 2):
+            raise TypeError(
+                '{} should be a single number or a list/tuple with length 2.'.
+                format(name))
+        # Validate both forms so e.g. hue=0.7 fails here, not at sampling time.
+        if not bound[0] <= value[0] <= value[1] <= bound[1]:
+            raise ValueError('{} values should be between {}'.format(
+                name, bound))
+        return None if value[0] == value[1] == center else value
+
+    @staticmethod
+    def get_params(brightness, contrast, saturation, hue):
+        """Build one randomly parameterised colour transform.
+
+        Args:
+            brightness, contrast, saturation, hue: ``[min, max]`` range to
+                sample the factor from, or None to skip that adjustment.
+
+        Returns:
+            A composed transform applying the enabled adjustments in a
+            random order.
+        """
+
+        def make_step(adjust, bounds):
+            # The factor is drawn once here and captured by the closure, so
+            # every image passed through this step gets the same factor.
+            factor = random.uniform(bounds[0], bounds[1])
+            return TF.Lambda(lambda img: adjust(img, factor))
+
+        # Factors are drawn in this fixed order; only the order in which the
+        # steps are applied is shuffled afterwards.
+        candidates = (
+            (F.adjust_brightness, brightness),
+            (F.adjust_contrast, contrast),
+            (F.adjust_saturation, saturation),
+            (F.adjust_hue, hue),
+        )
+
+        steps = [
+            make_step(adjust, bounds)
+            for adjust, bounds in candidates
+            if bounds is not None
+        ]
+
+        random.shuffle(steps)
+        return TF.Compose(steps)
+
+    def __call__(self, imgmap):
+        """Jitter a list of images, or return it untouched.
+
+        With probability ``threshold`` the images are transformed: one
+        transform for all of them if ``consistent``; otherwise a fresh one
+        per image (``seq_len == 0``) or per run of ``seq_len`` images.
+        """
+        assert isinstance(imgmap, list)
+        if not random.random() < self.threshold:
+            return imgmap
+
+        # All branches sample with the same stored ranges.
+        def sample():
+            return self.get_params(self.brightness, self.contrast,
+                                   self.saturation, self.hue)
+
+        if self.consistent:
+            shared = sample()
+            return [shared(frame) for frame in imgmap]
+
+        if self.seq_len == 0:
+            return [sample()(frame) for frame in imgmap]
+
+        jittered = []
+        for position, frame in enumerate(imgmap):
+            # Resample at the start of every run of seq_len images.
+            if position % self.seq_len == 0:
+                current = sample()
+            jittered.append(current(frame))
+        return jittered
+
+    def __repr__(self):
+        """Return ``ClassName(brightness=..., contrast=..., ...)``."""
+        # Only the four jitter ranges are reported.
+        shown = ('brightness', 'contrast', 'saturation', 'hue')
+        body = ', '.join(
+            '{0}={1}'.format(attr, getattr(self, attr)) for attr in shown)
+        return self.__class__.__name__ + '(' + body + ')'
+
+
+class VideoRandomGaussianBlur(torch.nn.Module):
+    """With probability ``p``, Gaussian-blur every image of a clip (list)."""
+
+    def __init__(self, radius_min=0.1, radius_max=2.0, p=0.5):
+        # Initialise nn.Module state first; without it repr() of this fails.
+        super().__init__()
+        self.radius_min = radius_min
+        self.radius_max = radius_max
+        self.p = p
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        if random.random() >= self.p:
+            return imgmap
+        return [
+            img.filter(ImageFilter.GaussianBlur(
+                radius=random.uniform(self.radius_min, self.radius_max)))
+            for img in imgmap]
+
+
+def apply_transform(images, trans):  # stack the items of trans(images) on dim 0
+    return torch.stack(trans(images), dim=0)
diff --git a/modelscope/preprocessors/video.py b/modelscope/preprocessors/video.py
index 36110d1b..0d2e8c3e 100644
--- a/modelscope/preprocessors/video.py
+++ b/modelscope/preprocessors/video.py
@@ -9,6 +9,12 @@ import torchvision.transforms._transforms_video as transforms
 from decord import VideoReader
 from torchvision.transforms import Compose
 
+from modelscope.metainfo import Preprocessors
+from modelscope.utils.constant import Fields, ModeKeys
+from modelscope.utils.type_assert import type_assert
+from .base import Preprocessor
+from .builder import PREPROCESSORS
+
 
 def ReadVideoData(cfg, video_path):
     """ simple interface to load video frames from file
@@ -227,3 +233,42 @@ class KineticsResizedCrop(object):
 
     def __call__(self, clip):
         return self._get_controlled_crop(clip)
+
+
+@PREPROCESSORS.register_module(
+    Fields.cv, module_name=Preprocessors.movie_scene_segmentation_preprocessor)
+class MovieSceneSegmentationPreprocessor(Preprocessor):
+    """Runs the train or eval transform and groups frames per keyframe set."""
+
+    def __init__(self, *args, **kwargs):
+        """Read ``is_train``, mode configs and ``num_keyframe`` from kwargs.
+
+        The per-mode configs are looked up under ModeKeys.TRAIN / EVAL.
+        """
+        super().__init__(*args, **kwargs)
+
+        self.is_train = kwargs.pop('is_train', True)
+        self.preprocessor_train_cfg = kwargs.pop(ModeKeys.TRAIN, None)
+        self.preprocessor_test_cfg = kwargs.pop(ModeKeys.EVAL, None)
+        self.num_keyframe = kwargs.pop('num_keyframe', 3)
+
+        # Function-scope import of the transform builder.
+        from .movie_scene_segmentation import get_transform
+        self.train_transform = get_transform(self.preprocessor_train_cfg)
+        self.test_transform = get_transform(self.preprocessor_test_cfg)
+
+    def train(self):
+        """Select the training transform for subsequent calls."""
+        self.is_train = True
+
+    def eval(self):
+        """Select the evaluation transform for subsequent calls."""
+        self.is_train = False
+
+    @type_assert(object, object)
+    def __call__(self, results):
+        """Stack transformed frames as (-1, num_keyframe, 3, 224, 224)."""
+        transforms = (
+            self.train_transform if self.is_train else self.test_transform)
+        frames = torch.stack(transforms(results), dim=0)
+        return frames.view(-1, self.num_keyframe, 3, 224, 224)
diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py
index 32ff674f..8f8938c8 100644
--- a/modelscope/trainers/__init__.py
+++ b/modelscope/trainers/__init__.py
@@ -8,7 +8,8 @@ if TYPE_CHECKING:
     from .base import DummyTrainer
     from .builder import build_trainer
     from .cv import (ImageInstanceSegmentationTrainer,
-                     ImagePortraitEnhancementTrainer)
+                     ImagePortraitEnhancementTrainer,
+                     MovieSceneSegmentationTrainer)
     from .multi_modal import CLIPTrainer
     from .nlp import SequenceClassificationTrainer
     from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer
@@ -21,7 +22,7 @@ else:
         'builder': ['build_trainer'],
         'cv': [
             'ImageInstanceSegmentationTrainer',
-            'ImagePortraitEnhancementTrainer'
+            'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer'
         ],
         'multi_modal': ['CLIPTrainer'],
         'nlp': ['SequenceClassificationTrainer'],
diff --git a/modelscope/trainers/cv/__init__.py b/modelscope/trainers/cv/__init__.py
index 99c2aea5..4c65870e 100644
--- a/modelscope/trainers/cv/__init__.py
+++ b/modelscope/trainers/cv/__init__.py
@@ -7,6 +7,7 @@ if TYPE_CHECKING:
     from .image_instance_segmentation_trainer import \
         ImageInstanceSegmentationTrainer
     from .image_portrait_enhancement_trainer import ImagePortraitEnhancementTrainer
+    from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer
 
 else:
     _import_structure = {
@@ -14,6 +15,7 @@ else:
         ['ImageInstanceSegmentationTrainer'],
         'image_portrait_enhancement_trainer':
         ['ImagePortraitEnhancementTrainer'],
+        'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer']
     }
 
     import sys
diff --git a/modelscope/trainers/cv/movie_scene_segmentation_trainer.py b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py
new file mode 100644
index 00000000..ee4dd849
--- /dev/null
+++ b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py
@@ -0,0 +1,20 @@
+from modelscope.metainfo import Trainers
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.trainer import EpochBasedTrainer
+
+
+@TRAINERS.register_module(module_name=Trainers.movie_scene_segmentation)
+class MovieSceneSegmentationTrainer(EpochBasedTrainer):
+    """Epoch-based trainer registered for movie scene segmentation."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def train(self, *args, **kwargs):
+        super().train(*args, **kwargs)
+
+    def evaluate(self, *args, **kwargs):
+        return super().evaluate(*args, **kwargs)
+
+    def prediction_step(self, model, inputs):
+        pass
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index dae7117e..a9d1345d 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -62,6 +62,7 @@ class CVTasks(object):
     video_embedding = 'video-embedding'
     virtual_try_on = 'virtual-try-on'
     crowd_counting = 'crowd-counting'
+    movie_scene_segmentation = 'movie-scene-segmentation'
 
     # reid and tracking
     video_single_object_tracking = 'video-single-object-tracking'
diff --git a/requirements/cv.txt b/requirements/cv.txt
index b7b3e4e8..ebb61851 100644
--- a/requirements/cv.txt
+++ b/requirements/cv.txt
@@ -21,6 +21,7 @@ regex
 scikit-image>=0.19.3
 scikit-learn>=0.20.1
 shapely
+shotdetect_scenedetect_lgss
 tensorflow-estimator>=1.15.1
 tf_slim
 timm>=0.4.9
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index ed07def7..9780ac4b 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -31,6 +31,12 @@ class ImgPreprocessor(Preprocessor):
 
 class MsDatasetTest(unittest.TestCase):
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_movie_scene_seg_toydata(self):  # train split has split_config
+        ms_ds_train = MsDataset.load('movie_scene_seg_toydata', split='train')
+        print(ms_ds_train._hf_ds.config_kwargs)
+        assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_coco(self):
         ms_ds_train = MsDataset.load(
diff --git a/tests/pipelines/test_movie_scene_segmentation.py b/tests/pipelines/test_movie_scene_segmentation.py
new file mode 100644
index 00000000..5993c634
--- /dev/null
+++ b/tests/pipelines/test_movie_scene_segmentation.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class MovieSceneSegmentationTest(unittest.TestCase):
+    """Smoke tests: the pipeline must return a truthy result for the video."""
+
+    def _run_pipeline(self, **pipeline_kwargs):
+        # Shared body of both tests: build the pipeline, run it on the sample
+        # video, print a truthy result and raise on a falsy one.
+        input_location = 'data/test/videos/movie_scene_segmentation_test_video.mp4'
+        movie_scene_segmentation_pipeline = pipeline(
+            Tasks.movie_scene_segmentation, **pipeline_kwargs)
+        result = movie_scene_segmentation_pipeline(input_location)
+        if not result:
+            raise ValueError('process error')
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_movie_scene_segmentation(self):
+        # An explicit model id is passed to pipeline() here.
+        model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'
+        self._run_pipeline(model=model_id)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_movie_scene_segmentation_with_default_task(self):
+        # No model argument is passed to pipeline() here.
+        self._run_pipeline()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_movie_scene_segmentation_trainer.py b/tests/trainers/test_movie_scene_segmentation_trainer.py
new file mode 100644
index 00000000..f25dc92a
--- /dev/null
+++ b/tests/trainers/test_movie_scene_segmentation_trainer.py
@@ -0,0 +1,109 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+import zipfile
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.models.cv.movie_scene_segmentation import \
+    MovieSceneSegmentationModel
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class TestImageInstanceSegmentationTrainer(unittest.TestCase):
+    # NOTE(review): the class name mentions image instance segmentation, but
+    # both cases build Trainers.movie_scene_segmentation -- consider renaming.
+
+    model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+        cache_path = snapshot_download(self.model_id)
+        config_path = os.path.join(cache_path, ModelFile.CONFIGURATION)
+        cfg = Config.from_file(config_path)
+
+        self.max_epochs = cfg.train.max_epochs
+
+        train_data_cfg = ConfigDict(
+            name='movie_scene_seg_toydata',
+            split='train',
+            cfg=cfg.preprocessor,
+            test_mode=False)
+
+        test_data_cfg = ConfigDict(
+            name='movie_scene_seg_toydata',
+            split='test',
+            cfg=cfg.preprocessor,
+            test_mode=True)
+
+        self.train_dataset = MsDataset.load(
+            dataset_name=train_data_cfg.name,
+            split=train_data_cfg.split,
+            namespace=train_data_cfg.namespace,
+            cfg=train_data_cfg.cfg,
+            test_mode=train_data_cfg.test_mode)
+        assert next(
+            iter(self.train_dataset.config_kwargs['split_config'].values()))
+
+        self.test_dataset = MsDataset.load(
+            dataset_name=test_data_cfg.name,
+            split=test_data_cfg.split,
+            namespace=test_data_cfg.namespace,
+            cfg=test_data_cfg.cfg,
+            test_mode=test_data_cfg.test_mode)
+        assert next(
+            iter(self.test_dataset.config_kwargs['split_config'].values()))
+
+        # mkdtemp() returns a directory that stays until tearDown removes it;
+        # an unreferenced TemporaryDirectory() deletes its own when finalized.
+        self.tmp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer(self):
+        kwargs = dict(
+            model=self.model_id,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            name=Trainers.movie_scene_segmentation, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(trainer.work_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_model_and_args(self):
+        # Own work dir for this test, deleted afterwards instead of leaked.
+        tmp_dir = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
+
+        cache_path = snapshot_download(self.model_id)
+        model = MovieSceneSegmentationModel.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            work_dir=tmp_dir)
+
+        trainer = build_trainer(
+            name=Trainers.movie_scene_segmentation, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(trainer.work_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+
+if __name__ == '__main__':
+    unittest.main()

From e5c9ded870846f8b9df3b3308e51b3e52f48667e Mon Sep 17 00:00:00 2001
From: "xuanjie.wxb" <xuanjie.wxb@alibaba-inc.com>
Date: Thu, 1 Sep 2022 09:19:59 +0800
Subject: [PATCH 031/175] [to #42322933] add lstm-crf ner model code        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code
---
 modelscope/metainfo.py                        |   1 +
 modelscope/models/nlp/__init__.py             |  10 +-
 .../nlp/nncrf_for_named_entity_recognition.py | 114 ++++++++++++++++--
 .../nlp/named_entity_recognition_pipeline.py  |   3 +
 modelscope/preprocessors/nlp.py               |  17 ++-
 .../test_named_entity_recognition.py          |  51 ++++++--
 6 files changed, 170 insertions(+), 26 deletions(-)

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index f1179be8..4bb0857b 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -44,6 +44,7 @@ class Models(object):
     space_modeling = 'space-modeling'
     star = 'star'
     tcrf = 'transformer-crf'
+    lcrf = 'lstm-crf'
     bart = 'bart'
     gpt3 = 'gpt3'
     bert_for_ds = 'bert-for-document-segmentation'
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 8bf06c1d..90a37cea 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -11,7 +11,9 @@ if TYPE_CHECKING:
     from .csanmt_for_translation import CsanmtForTranslation
     from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM,
                                   BertForMaskedLM)
-    from .nncrf_for_named_entity_recognition import TransformerCRFForNamedEntityRecognition
+    from .nncrf_for_named_entity_recognition import (
+        TransformerCRFForNamedEntityRecognition,
+        LSTMCRFForNamedEntityRecognition)
     from .palm_v2 import PalmForTextGeneration
     from .token_classification import SbertForTokenClassification
     from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification
@@ -34,8 +36,10 @@ else:
         'bert_for_document_segmentation': ['BertForDocumentSegmentation'],
         'masked_language':
         ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'],
-        'nncrf_for_named_entity_recognition':
-        ['TransformerCRFForNamedEntityRecognition'],
+        'nncrf_for_named_entity_recognition': [
+            'TransformerCRFForNamedEntityRecognition',
+            'LSTMCRFForNamedEntityRecognition'
+        ],
         'palm_v2': ['PalmForTextGeneration'],
         'token_classification': ['SbertForTokenClassification'],
         'sequence_classification':
diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
index 2015997f..37216510 100644
--- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
+++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
@@ -10,27 +10,25 @@ from modelscope.models import TorchModel
 from modelscope.models.builder import MODELS
 from modelscope.utils.constant import ModelFile, Tasks
 
-__all__ = ['TransformerCRFForNamedEntityRecognition']
+__all__ = [
+    'TransformerCRFForNamedEntityRecognition',
+    'LSTMCRFForNamedEntityRecognition'
+]
 
 
-@MODELS.register_module(
-    Tasks.named_entity_recognition, module_name=Models.tcrf)
-class TransformerCRFForNamedEntityRecognition(TorchModel):
-    """This model wraps the TransformerCRF model to register into model sets.
-    """
+class SequenceLabelingForNamedEntityRecognition(TorchModel):
 
     def __init__(self, model_dir, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
-
-        self.config = AutoConfig.from_pretrained(model_dir)
-        num_labels = self.config.num_labels
-
-        self.model = TransformerCRF(model_dir, num_labels)
+        self.model = self.init_model(model_dir, *args, **kwargs)
 
         model_ckpt = os.path.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
         self.model.load_state_dict(
             torch.load(model_ckpt, map_location=torch.device('cpu')))
 
+    def init_model(self, model_dir, *args, **kwargs):
+        raise NotImplementedError
+
     def train(self):
         return self.model.train()
 
@@ -64,6 +62,39 @@ class TransformerCRFForNamedEntityRecognition(TorchModel):
         return output
 
 
+@MODELS.register_module(
+    Tasks.named_entity_recognition, module_name=Models.tcrf)
+class TransformerCRFForNamedEntityRecognition(
+        SequenceLabelingForNamedEntityRecognition):
+    """This model wraps the TransformerCRF model to register into model sets.
+    """
+
+    def init_model(self, model_dir, *args, **kwargs):
+        """Build a TransformerCRF sized by ``num_labels`` from the config."""
+        self.config = AutoConfig.from_pretrained(model_dir)
+
+        # The label count comes from the config loaded from model_dir.
+        return TransformerCRF(model_dir, self.config.num_labels)
+
+
+@MODELS.register_module(
+    Tasks.named_entity_recognition, module_name=Models.lcrf)
+class LSTMCRFForNamedEntityRecognition(
+        SequenceLabelingForNamedEntityRecognition):
+    """This model wraps the LSTMCRF model to register into model sets.
+    """
+
+    def init_model(self, model_dir, *args, **kwargs):
+        """Build an LSTMCRF from the sizes stored in the model config."""
+        self.config = config = AutoConfig.from_pretrained(model_dir)
+        # Sizes are read in the original order (vocab, embed, labels, hidden).
+        return LSTMCRF(
+            config.vocab_size,
+            config.embed_width,
+            config.num_labels,
+            config.lstm_hidden_size)
+
+
 class TransformerCRF(nn.Module):
     """A transformer based model to NER tasks.
 
@@ -105,6 +136,56 @@ class TransformerCRF(nn.Module):
         return outputs
 
 
+class LSTMCRF(nn.Module):
+    """
+    A standard bilstm-crf model for fast prediction.
+    """
+
+    def __init__(self,
+                 vocab_size,
+                 embed_width,
+                 num_labels,
+                 lstm_hidden_size=100,
+                 **kwargs):
+        """Create embedding, one-layer BiLSTM, label projection and CRF."""
+        super().__init__()
+        self.embedding = Embedding(vocab_size, embed_width)
+        self.lstm = nn.LSTM(
+            embed_width,
+            lstm_hidden_size,
+            num_layers=1,
+            bidirectional=True,
+            batch_first=True)
+        # Both LSTM directions are concatenated, hence 2 * hidden size.
+        self.ffn = nn.Linear(lstm_hidden_size * 2, num_labels)
+        self.crf = CRF(num_labels, batch_first=True)
+
+    def forward(self, inputs):
+        """Return ``{'logits': ...}``; mask-selected steps are packed first."""
+        hidden, _ = self.lstm(self.embedding(inputs['input_ids']))
+        logits = self.ffn(hidden)
+        if 'label_mask' not in inputs:
+            return {'logits': logits}
+
+        label_mask = inputs['label_mask']
+        kept_counts = label_mask.sum(-1).long()
+        packed = torch.zeros_like(logits)
+        for row, (row_mask, kept) in enumerate(zip(label_mask, kept_counts)):
+            # Keep only positions selected by the mask; the rest stays zero.
+            packed[row, :kept, :] = logits[row].masked_select(
+                row_mask.unsqueeze(-1)).view(kept, -1)
+        return {'logits': packed}
+
+    def decode(self, inputs):
+        """CRF-decode ``inputs['logits']`` over the first sum(mask) steps."""
+        label_mask = inputs['label_mask']
+        seq_lens = label_mask.sum(-1).long()
+        steps = torch.arange(label_mask.shape[1], device=seq_lens.device)
+        mask = steps.unsqueeze(0) < seq_lens.unsqueeze(1)
+        predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0)
+        return {'predicts': predicts}
+
+
 class CRF(nn.Module):
     """Conditional random field.
     This module implements a conditional random field [LMP01]_. The forward computation
@@ -547,3 +628,14 @@ class CRF(nn.Module):
 
         return torch.where(mask.unsqueeze(-1), best_tags_arr,
                            oor_tag).permute(2, 1, 0)
+
+
+class Embedding(nn.Module):
+    """Thin nn.Embedding wrapper mapping token ids to embed_width vectors."""
+
+    def __init__(self, vocab_size, embed_width):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, embed_width)
+
+    def forward(self, input_ids):
+        return self.embedding(input_ids)
diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
index b0b06c88..8fbdde86 100644
--- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
+++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
@@ -84,6 +84,9 @@ class NamedEntityRecognitionPipeline(Pipeline):
                     entity['span'] = text[entity['start']:entity['end']]
                     entities.append(entity)
                     entity = {}
+        if entity:
+            entity['span'] = text[entity['start']:entity['end']]
+            entities.append(entity)
         outputs = {OutputKeys.OUTPUT: entities}
 
         return outputs
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 345d3711..578bbd49 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -5,8 +5,7 @@ import uuid
 from typing import Any, Dict, Iterable, Optional, Tuple, Union
 
 import numpy as np
-import torch
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, BertTokenizerFast
 
 from modelscope.metainfo import Models, Preprocessors
 from modelscope.outputs import OutputKeys
@@ -539,8 +538,13 @@ class NERPreprocessor(Preprocessor):
 
         self.model_dir: str = model_dir
         self.sequence_length = kwargs.pop('sequence_length', 512)
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_dir, use_fast=True)
+        self.is_transformer_based_model = 'lstm' not in model_dir
+        if self.is_transformer_based_model:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_dir, use_fast=True)
+        else:
+            self.tokenizer = BertTokenizerFast.from_pretrained(
+                model_dir, use_fast=True)
         self.is_split_into_words = self.tokenizer.init_kwargs.get(
             'is_split_into_words', False)
 
@@ -604,6 +608,11 @@ class NERPreprocessor(Preprocessor):
                 else:
                     label_mask.append(1)
                     offset_mapping.append(encodings['offset_mapping'][i])
+
+        if not self.is_transformer_based_model:
+            input_ids = input_ids[1:-1]
+            attention_mask = attention_mask[1:-1]
+            label_mask = label_mask[1:-1]
         return {
             'text': text,
             'input_ids': input_ids,
diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py
index 5ba93f49..ad0fa228 100644
--- a/tests/pipelines/test_named_entity_recognition.py
+++ b/tests/pipelines/test_named_entity_recognition.py
@@ -3,7 +3,8 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import TransformerCRFForNamedEntityRecognition
+from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition,
+                                   TransformerCRFForNamedEntityRecognition)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
 from modelscope.preprocessors import NERPreprocessor
@@ -12,12 +13,13 @@ from modelscope.utils.test_utils import test_level
 
 
 class NamedEntityRecognitionTest(unittest.TestCase):
-    model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
+    tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
+    lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news'
     sentence = '这与温岭市新河镇的一个神秘的传说有关。'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_by_direct_model_download(self):
-        cache_path = snapshot_download(self.model_id)
+    def test_run_tcrf_by_direct_model_download(self):
+        cache_path = snapshot_download(self.tcrf_model_id)
         tokenizer = NERPreprocessor(cache_path)
         model = TransformerCRFForNamedEntityRecognition(
             cache_path, tokenizer=tokenizer)
@@ -32,9 +34,36 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         print()
         print(f'pipeline2: {pipeline2(input=self.sentence)}')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_lcrf_by_direct_model_download(self):
+        cache_path = snapshot_download(self.lcrf_model_id)
+        tokenizer = NERPreprocessor(cache_path)
+        model = LSTMCRFForNamedEntityRecognition(
+            cache_path, tokenizer=tokenizer)
+        pipeline1 = NamedEntityRecognitionPipeline(
+            model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.named_entity_recognition,
+            model=model,
+            preprocessor=tokenizer)
+        print(f'sentence: {self.sentence}\n'
+              f'pipeline1:{pipeline1(input=self.sentence)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.sentence)}')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
+    def test_run_tcrf_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.tcrf_model_id)
+        tokenizer = NERPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.named_entity_recognition,
+            model=model,
+            preprocessor=tokenizer)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_lcrf_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.lcrf_model_id)
         tokenizer = NERPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.named_entity_recognition,
@@ -43,9 +72,15 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         print(pipeline_ins(input=self.sentence))
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
-    def test_run_with_model_name(self):
+    def test_run_tcrf_with_model_name(self):
         pipeline_ins = pipeline(
-            task=Tasks.named_entity_recognition, model=self.model_id)
+            task=Tasks.named_entity_recognition, model=self.tcrf_model_id)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_lcrf_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.named_entity_recognition, model=self.lcrf_model_id)
         print(pipeline_ins(input=self.sentence))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')

From c8b6030b8e0fc8de10e16a38412409ba67ef6bf4 Mon Sep 17 00:00:00 2001
From: "yongfei.zyf" <yongfei.zyf@alibaba-inc.com>
Date: Thu, 1 Sep 2022 14:20:04 +0800
Subject: [PATCH 032/175] [to #42322933] Add hicossl_video_embedding_pipeline
 to maas lib         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9969472

---
 modelscope/metainfo.py                        |   1 +
 .../models/cv/action_recognition/models.py    |  45 ++-
 .../models/cv/action_recognition/s3dg.py      | 301 ++++++++++++++++++
 modelscope/pipelines/cv/__init__.py           |   2 +
 .../cv/action_recognition_pipeline.py         |   1 +
 .../cv/hicossl_video_embedding_pipeline.py    |  75 +++++
 modelscope/preprocessors/video.py             | 119 +++++--
 .../pipelines/test_hicossl_video_embedding.py |  26 ++
 8 files changed, 538 insertions(+), 32 deletions(-)
 create mode 100644 modelscope/models/cv/action_recognition/s3dg.py
 create mode 100644 modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py
 create mode 100644 tests/pipelines/test_hicossl_video_embedding.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 4bb0857b..51fed99f 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -99,6 +99,7 @@ class Pipelines(object):
     animal_recognition = 'resnet101-animal-recognition'
     general_recognition = 'resnet101-general-recognition'
     cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding'
+    hicossl_video_embedding = 'hicossl-s3dg-video_embedding'
     body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
     body_3d_keypoints = 'canonical_body-3d-keypoints_video'
     human_detection = 'resnet18-human-detection'
diff --git a/modelscope/models/cv/action_recognition/models.py b/modelscope/models/cv/action_recognition/models.py
index 48e75ae1..a5964e21 100644
--- a/modelscope/models/cv/action_recognition/models.py
+++ b/modelscope/models/cv/action_recognition/models.py
@@ -1,5 +1,6 @@
 import torch.nn as nn
 
+from .s3dg import Inception3D
 from .tada_convnext import TadaConvNeXt
 
 
@@ -26,11 +27,25 @@ class BaseVideoModel(nn.Module):
         super(BaseVideoModel, self).__init__()
         # the backbone is created according to meta-architectures
         # defined in models/base/backbone.py
-        self.backbone = TadaConvNeXt(cfg)
+        if cfg.MODEL.NAME == 'ConvNeXt_tiny':
+            self.backbone = TadaConvNeXt(cfg)
+        elif cfg.MODEL.NAME == 'S3DG':
+            self.backbone = Inception3D(cfg)
+        else:
+            error_str = 'backbone {} is not supported, ConvNeXt_tiny or S3DG is supported'.format(
+                cfg.MODEL.NAME)
+            raise NotImplementedError(error_str)
 
         # the head is created according to the heads
         # defined in models/module_zoo/heads
-        self.head = BaseHead(cfg)
+        if cfg.VIDEO.HEAD.NAME == 'BaseHead':
+            self.head = BaseHead(cfg)
+        elif cfg.VIDEO.HEAD.NAME == 'AvgHead':
+            self.head = AvgHead(cfg)
+        else:
+            error_str = 'head {} is not supported, BaseHead or AvgHead is supported'.format(
+                cfg.VIDEO.HEAD.NAME)
+            raise NotImplementedError(error_str)
 
     def forward(self, x):
         x = self.backbone(x)
@@ -88,3 +103,29 @@ class BaseHead(nn.Module):
         out = self.activation(out)
         out = out.view(out.shape[0], -1)
         return out, x.view(x.shape[0], -1)
+
+
+class AvgHead(nn.Module):
+    """
+    Constructs a head that global-average-pools 5-D backbone features.
+    """
+
+    def __init__(
+        self,
+        cfg,
+    ):
+        """
+        Args:
+            cfg (Config): global config object.
+        """
+        super(AvgHead, self).__init__()
+        self.cfg = cfg
+        self.global_avg_pool = nn.AdaptiveAvgPool3d(1)
+
+    def forward(self, x):
+        if len(x.shape) == 5:
+            x = self.global_avg_pool(x)
+            # (N, C, T, H, W) -> (N, T, H, W, C).
+            x = x.permute((0, 2, 3, 4, 1))
+        out = x.view(x.shape[0], -1)
+        return out, x.view(x.shape[0], -1)
diff --git a/modelscope/models/cv/action_recognition/s3dg.py b/modelscope/models/cv/action_recognition/s3dg.py
new file mode 100644
index 00000000..f258df16
--- /dev/null
+++ b/modelscope/models/cv/action_recognition/s3dg.py
@@ -0,0 +1,301 @@
+import torch
+import torch.nn as nn
+
+
+class InceptionBaseConv3D(nn.Module):
+    """
+    Constructs basic inception 3D conv.
+    Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
+    """
+
+    def __init__(self,
+                 cfg,
+                 in_planes,
+                 out_planes,
+                 kernel_size,
+                 stride,
+                 padding=0):
+        super(InceptionBaseConv3D, self).__init__()
+        self.conv = nn.Conv3d(
+            in_planes,
+            out_planes,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=False)
+        self.bn = nn.BatchNorm3d(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+
+        # init
+        self.conv.weight.data.normal_(
+            mean=0, std=0.01)  # original s3d is truncated normal within 2 std
+        self.bn.weight.data.fill_(1)
+        self.bn.bias.data.zero_()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+
+class InceptionBlock3D(nn.Module):
+    """
+    Element constructing the S3D/S3DG.
+    See models/base/backbone.py L99-186.
+
+    Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
+    """
+
+    def __init__(self, cfg, in_planes, out_planes):
+        super(InceptionBlock3D, self).__init__()
+
+        _gating = cfg.VIDEO.BACKBONE.BRANCH.GATING
+
+        assert len(out_planes) == 6
+        assert isinstance(out_planes, list)
+
+        [
+            num_out_0_0a, num_out_1_0a, num_out_1_0b, num_out_2_0a,
+            num_out_2_0b, num_out_3_0b
+        ] = out_planes
+
+        self.branch0 = nn.Sequential(
+            InceptionBaseConv3D(
+                cfg, in_planes, num_out_0_0a, kernel_size=1, stride=1), )
+        self.branch1 = nn.Sequential(
+            InceptionBaseConv3D(
+                cfg, in_planes, num_out_1_0a, kernel_size=1, stride=1),
+            STConv3d(
+                cfg,
+                num_out_1_0a,
+                num_out_1_0b,
+                kernel_size=3,
+                stride=1,
+                padding=1),
+        )
+        self.branch2 = nn.Sequential(
+            InceptionBaseConv3D(
+                cfg, in_planes, num_out_2_0a, kernel_size=1, stride=1),
+            STConv3d(
+                cfg,
+                num_out_2_0a,
+                num_out_2_0b,
+                kernel_size=3,
+                stride=1,
+                padding=1),
+        )
+        self.branch3 = nn.Sequential(
+            nn.MaxPool3d(kernel_size=(3, 3, 3), stride=1, padding=1),
+            InceptionBaseConv3D(
+                cfg, in_planes, num_out_3_0b, kernel_size=1, stride=1),
+        )
+
+        self.out_channels = sum(
+            [num_out_0_0a, num_out_1_0b, num_out_2_0b, num_out_3_0b])
+
+        self.gating = _gating
+        if _gating:
+            self.gating_b0 = SelfGating(num_out_0_0a)
+            self.gating_b1 = SelfGating(num_out_1_0b)
+            self.gating_b2 = SelfGating(num_out_2_0b)
+            self.gating_b3 = SelfGating(num_out_3_0b)
+
+    def forward(self, x):
+        x0 = self.branch0(x)
+        x1 = self.branch1(x)
+        x2 = self.branch2(x)
+        x3 = self.branch3(x)
+        if self.gating:
+            x0 = self.gating_b0(x0)
+            x1 = self.gating_b1(x1)
+            x2 = self.gating_b2(x2)
+            x3 = self.gating_b3(x3)
+
+        out = torch.cat((x0, x1, x2, x3), 1)
+
+        return out
+
+
+class SelfGating(nn.Module):
+
+    def __init__(self, input_dim):
+        super(SelfGating, self).__init__()
+        self.fc = nn.Linear(input_dim, input_dim)
+
+    def forward(self, input_tensor):
+        """Feature gating as used in S3D-G"""
+        spatiotemporal_average = torch.mean(input_tensor, dim=[2, 3, 4])
+        weights = self.fc(spatiotemporal_average)
+        weights = torch.sigmoid(weights)
+        return weights[:, :, None, None, None] * input_tensor
+
+
+class STConv3d(nn.Module):
+    """
+    Element constructing the S3D/S3DG.
+    See models/base/backbone.py L99-186.
+
+    Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
+    """
+
+    def __init__(self,
+                 cfg,
+                 in_planes,
+                 out_planes,
+                 kernel_size,
+                 stride,
+                 padding=0):
+        super(STConv3d, self).__init__()
+        if isinstance(stride, tuple):
+            t_stride = stride[0]
+            stride = stride[-1]
+        else:  # int
+            t_stride = stride
+
+        self.bn_mmt = cfg.BN.MOMENTUM
+        self.bn_eps = float(cfg.BN.EPS)
+        self._construct_branch(cfg, in_planes, out_planes, kernel_size, stride,
+                               t_stride, padding)
+
+    def _construct_branch(self,
+                          cfg,
+                          in_planes,
+                          out_planes,
+                          kernel_size,
+                          stride,
+                          t_stride,
+                          padding=0):
+        self.conv1 = nn.Conv3d(
+            in_planes,
+            out_planes,
+            kernel_size=(1, kernel_size, kernel_size),
+            stride=(1, stride, stride),
+            padding=(0, padding, padding),
+            bias=False)
+        self.conv2 = nn.Conv3d(
+            out_planes,
+            out_planes,
+            kernel_size=(kernel_size, 1, 1),
+            stride=(t_stride, 1, 1),
+            padding=(padding, 0, 0),
+            bias=False)
+
+        self.bn1 = nn.BatchNorm3d(
+            out_planes, eps=self.bn_eps, momentum=self.bn_mmt)
+        self.bn2 = nn.BatchNorm3d(
+            out_planes, eps=self.bn_eps, momentum=self.bn_mmt)
+        self.relu = nn.ReLU(inplace=True)
+
+        # init
+        self.conv1.weight.data.normal_(
+            mean=0, std=0.01)  # original s3d is truncated normal within 2 std
+        self.conv2.weight.data.normal_(
+            mean=0, std=0.01)  # original s3d is truncated normal within 2 std
+        self.bn1.weight.data.fill_(1)
+        self.bn1.bias.data.zero_()
+        self.bn2.weight.data.fill_(1)
+        self.bn2.bias.data.zero_()
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu(x)
+        return x
+
+
+class Inception3D(nn.Module):
+    """
+    Backbone architecture for I3D/S3DG.
+    Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
+    """
+
+    def __init__(self, cfg):
+        """
+        Args:
+            cfg (Config): global config object.
+        """
+        super(Inception3D, self).__init__()
+        _input_channel = cfg.DATA.NUM_INPUT_CHANNELS
+        self._construct_backbone(cfg, _input_channel)
+
+    def _construct_backbone(self, cfg, input_channel):
+        # ------------------- Block 1 -------------------
+        self.Conv_1a = STConv3d(
+            cfg, input_channel, 64, kernel_size=7, stride=2, padding=3)
+
+        self.block1 = nn.Sequential(self.Conv_1a)  # (64, 32, 112, 112)
+
+        # ------------------- Block 2 -------------------
+        self.MaxPool_2a = nn.MaxPool3d(
+            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
+        self.Conv_2b = InceptionBaseConv3D(
+            cfg, 64, 64, kernel_size=1, stride=1)
+        self.Conv_2c = STConv3d(
+            cfg, 64, 192, kernel_size=3, stride=1, padding=1)
+
+        self.block2 = nn.Sequential(
+            self.MaxPool_2a,  # (64, 32, 56, 56)
+            self.Conv_2b,  # (64, 32, 56, 56)
+            self.Conv_2c)  # (192, 32, 56, 56)
+
+        # ------------------- Block 3 -------------------
+        self.MaxPool_3a = nn.MaxPool3d(
+            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
+        self.Mixed_3b = InceptionBlock3D(
+            cfg, in_planes=192, out_planes=[64, 96, 128, 16, 32, 32])
+        self.Mixed_3c = InceptionBlock3D(
+            cfg, in_planes=256, out_planes=[128, 128, 192, 32, 96, 64])
+
+        self.block3 = nn.Sequential(
+            self.MaxPool_3a,  # (192, 32, 28, 28)
+            self.Mixed_3b,  # (256, 32, 28, 28)
+            self.Mixed_3c)  # (480, 32, 28, 28)
+
+        # ------------------- Block 4 -------------------
+        self.MaxPool_4a = nn.MaxPool3d(
+            kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
+        self.Mixed_4b = InceptionBlock3D(
+            cfg, in_planes=480, out_planes=[192, 96, 208, 16, 48, 64])
+        self.Mixed_4c = InceptionBlock3D(
+            cfg, in_planes=512, out_planes=[160, 112, 224, 24, 64, 64])
+        self.Mixed_4d = InceptionBlock3D(
+            cfg, in_planes=512, out_planes=[128, 128, 256, 24, 64, 64])
+        self.Mixed_4e = InceptionBlock3D(
+            cfg, in_planes=512, out_planes=[112, 144, 288, 32, 64, 64])
+        self.Mixed_4f = InceptionBlock3D(
+            cfg, in_planes=528, out_planes=[256, 160, 320, 32, 128, 128])
+
+        self.block4 = nn.Sequential(
+            self.MaxPool_4a,  # (480, 16, 14, 14)
+            self.Mixed_4b,  # (512, 16, 14, 14)
+            self.Mixed_4c,  # (512, 16, 14, 14)
+            self.Mixed_4d,  # (512, 16, 14, 14)
+            self.Mixed_4e,  # (528, 16, 14, 14)
+            self.Mixed_4f)  # (832, 16, 14, 14)
+
+        # ------------------- Block 5 -------------------
+        self.MaxPool_5a = nn.MaxPool3d(
+            kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0))
+        self.Mixed_5b = InceptionBlock3D(
+            cfg, in_planes=832, out_planes=[256, 160, 320, 32, 128, 128])
+        self.Mixed_5c = InceptionBlock3D(
+            cfg, in_planes=832, out_planes=[384, 192, 384, 48, 128, 128])
+
+        self.block5 = nn.Sequential(
+            self.MaxPool_5a,  # (832, 8, 7, 7)
+            self.Mixed_5b,  # (832, 8, 7, 7)
+            self.Mixed_5c)  # (1024, 8, 7, 7)
+
+    def forward(self, x):
+        if isinstance(x, dict):
+            x = x['video']
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        x = self.block5(x)
+        return x
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index bd175578..01c69758 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -9,6 +9,7 @@ if TYPE_CHECKING:
     from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline
     from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline
     from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline
+    from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline
     from .crowd_counting_pipeline import CrowdCountingPipeline
     from .image_detection_pipeline import ImageDetectionPipeline
     from .image_salient_detection_pipeline import ImageSalientDetectionPipeline
@@ -51,6 +52,7 @@ else:
         'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'],
         'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'],
         'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'],
+        'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'],
         'crowd_counting_pipeline': ['CrowdCountingPipeline'],
         'image_detection_pipeline': ['ImageDetectionPipeline'],
         'image_salient_detection_pipeline': ['ImageSalientDetectionPipeline'],
diff --git a/modelscope/pipelines/cv/action_recognition_pipeline.py b/modelscope/pipelines/cv/action_recognition_pipeline.py
index 087548f0..e3400ea7 100644
--- a/modelscope/pipelines/cv/action_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/action_recognition_pipeline.py
@@ -33,6 +33,7 @@ class ActionRecognitionPipeline(Pipeline):
         config_path = osp.join(self.model, ModelFile.CONFIGURATION)
         logger.info(f'loading config from {config_path}')
         self.cfg = Config.from_file(config_path)
+
         self.infer_model = BaseVideoModel(cfg=self.cfg).to(self.device)
         self.infer_model.eval()
         self.infer_model.load_state_dict(
diff --git a/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py b/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py
new file mode 100644
index 00000000..5e4cd4c6
--- /dev/null
+++ b/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py
@@ -0,0 +1,75 @@
+import math
+import os.path as osp
+from typing import Any, Dict
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.action_recognition import BaseVideoModel
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import ReadVideoData
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.video_embedding, module_name=Pipelines.hicossl_video_embedding)
+class HICOSSLVideoEmbeddingPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a hicossl video embedding pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {model_path}')
+        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
+        logger.info(f'loading config from {config_path}')
+        self.cfg = Config.from_file(config_path)
+        self.infer_model = BaseVideoModel(cfg=self.cfg).to(self.device)
+        self.infer_model.eval()
+        self.infer_model.load_state_dict(
+            torch.load(model_path, map_location=self.device)['model_state'],
+            strict=False)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            video_input_data = ReadVideoData(
+                self.cfg, input, num_temporal_views_override=1).to(self.device)
+        else:
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+        result = {'video_data': video_input_data}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        feature = self.perform_inference(input['video_data'])
+        return {OutputKeys.VIDEO_EMBEDDING: feature.data.cpu().numpy()}
+
+    @torch.no_grad()
+    def perform_inference(self, data, max_bsz=4):
+        """ Perform feature extracting for a given video
+        Args:
+            data (Tensor): input video clips, processed in chunks along dim 0.
+            max_bsz (int): the maximum batch size, limited by GPU memory.
+        Returns:
+            pred (Tensor): the extracted features for input video clips.
+        """
+        iter_num = math.ceil(data.size(0) / max_bsz)
+        preds_list = []
+        for i in range(iter_num):
+            preds_list.append(
+                self.infer_model(data[i * max_bsz:(i + 1) * max_bsz])[0])
+        pred = torch.cat(preds_list, dim=0)
+        return pred
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/preprocessors/video.py b/modelscope/preprocessors/video.py
index 0d2e8c3e..f693cd9e 100644
--- a/modelscope/preprocessors/video.py
+++ b/modelscope/preprocessors/video.py
@@ -16,34 +16,49 @@ from .base import Preprocessor
 from .builder import PREPROCESSORS
 
 
-def ReadVideoData(cfg, video_path):
+def ReadVideoData(cfg,
+                  video_path,
+                  num_spatial_crops_override=None,
+                  num_temporal_views_override=None):
     """ simple interface to load video frames from file
 
     Args:
         cfg (Config): The global config object.
         video_path (str): video file path
+        num_spatial_crops_override (int): the spatial crops per clip
+        num_temporal_views_override (int): the temporal clips per video
+    Returns:
+        data (Tensor): the normalized video clips for model inputs
     """
-    data = _decode_video(cfg, video_path)
-    transform = kinetics400_tranform(cfg)
+    data = _decode_video(cfg, video_path, num_temporal_views_override)
+    if num_spatial_crops_override is not None:
+        num_spatial_crops = num_spatial_crops_override
+        transform = kinetics400_tranform(cfg, num_spatial_crops_override)
+    else:
+        num_spatial_crops = cfg.TEST.NUM_SPATIAL_CROPS
+        transform = kinetics400_tranform(cfg, cfg.TEST.NUM_SPATIAL_CROPS)
     data_list = []
     for i in range(data.size(0)):
-        for j in range(cfg.TEST.NUM_SPATIAL_CROPS):
+        for j in range(num_spatial_crops):
             transform.transforms[1].set_spatial_index(j)
             data_list.append(transform(data[i]))
     return torch.stack(data_list, dim=0)
 
 
-def kinetics400_tranform(cfg):
+def kinetics400_tranform(cfg, num_spatial_crops):
     """
     Configs the transform for the kinetics-400 dataset.
     We apply controlled spatial cropping and normalization.
     Args:
         cfg (Config): The global config object.
+        num_spatial_crops (int): the spatial crops per clip
+    Returns:
+        transform_function (Compose): the transform function for input clips
     """
     resize_video = KineticsResizedCrop(
         short_side_range=[cfg.DATA.TEST_SCALE, cfg.DATA.TEST_SCALE],
         crop_size=cfg.DATA.TEST_CROP_SIZE,
-        num_spatial_crops=cfg.TEST.NUM_SPATIAL_CROPS)
+        num_spatial_crops=num_spatial_crops)
     std_transform_list = [
         transforms.ToTensorVideo(), resize_video,
         transforms.NormalizeVideo(
@@ -60,17 +75,17 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx,
             vid_length  (int): the length of the whole video (valid selection range).
             vid_fps     (int): the original video fps
             target_fps  (int): the normalized video fps
-            clip_idx    (int): -1 for random temporal sampling, and positive values for
-                                sampling specific clip from the video
+            clip_idx    (int): -1 for random temporal sampling, and positive values for sampling specific
+                                clip from the video
             num_clips   (int): the total clips to be sampled from each video.
-                                combined with clip_idx, the sampled video is the "clip_idx-th"
-                                 video from "num_clips" videos.
+                                combined with clip_idx, the sampled video is the "clip_idx-th" video from
+                                "num_clips" videos.
             num_frames  (int): number of frames in each sampled clips.
             interval    (int): the interval to sample each frame.
             minus_interval (bool): control the end index
         Returns:
             index (tensor): the sampled frame indexes
-        """
+    """
     if num_frames == 1:
         index = [random.randint(0, vid_length - 1)]
     else:
@@ -78,7 +93,10 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx,
         clip_length = num_frames * interval * vid_fps / target_fps
 
         max_idx = max(vid_length - clip_length, 0)
-        start_idx = clip_idx * math.floor(max_idx / (num_clips - 1))
+        if num_clips == 1:
+            start_idx = max_idx / 2
+        else:
+            start_idx = clip_idx * math.floor(max_idx / (num_clips - 1))
         if minus_interval:
             end_idx = start_idx + clip_length - interval
         else:
@@ -90,59 +108,79 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx,
     return index
 
 
-def _decode_video_frames_list(cfg, frames_list, vid_fps):
+def _decode_video_frames_list(cfg,
+                              frames_list,
+                              vid_fps,
+                              num_temporal_views_override=None):
     """
         Decodes the video given the numpy frames.
         Args:
             cfg          (Config): The global config object.
             frames_list  (list):  all frames for a video, the frames should be numpy array.
             vid_fps      (int):  the fps of this video.
+            num_temporal_views_override (int): the temporal clips per video
         Returns:
             frames            (Tensor): video tensor data
     """
     assert isinstance(frames_list, list)
-    num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
+    if num_temporal_views_override is not None:
+        num_clips_per_video = num_temporal_views_override
+    else:
+        num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
 
     frame_list = []
     for clip_idx in range(num_clips_per_video):
         # for each clip in the video,
         # a list is generated before decoding the specified frames from the video
         list_ = _interval_based_sampling(
-            len(frames_list), vid_fps, cfg.DATA.TARGET_FPS, clip_idx,
-            num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES,
-            cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL)
+            len(frames_list),
+            vid_fps,
+            cfg.DATA.TARGET_FPS,
+            clip_idx,
+            num_clips_per_video,
+            cfg.DATA.NUM_INPUT_FRAMES,
+            cfg.DATA.SAMPLING_RATE,
+            cfg.DATA.MINUS_INTERVAL,
+        )
         frames = None
         frames = torch.from_numpy(
-            np.stack([frames_list[l_index] for l_index in list_.tolist()],
-                     axis=0))
+            np.stack([frames_list[index] for index in list_.tolist()], axis=0))
         frame_list.append(frames)
     frames = torch.stack(frame_list)
-    if num_clips_per_video == 1:
-        frames = frames.squeeze(0)
-
+    # Always keep the leading clip axis, even when a single clip is sampled.
     return frames
 
 
-def _decode_video(cfg, path):
+def _decode_video(cfg, path, num_temporal_views_override=None):
     """
         Decodes the video given the numpy frames.
         Args:
+            cfg          (Config): The global config object.
             path          (str): video file path.
+            num_temporal_views_override (int): the temporal clips per video
         Returns:
             frames            (Tensor): video tensor data
     """
     vr = VideoReader(path)
-
-    num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
+    if num_temporal_views_override is not None:
+        num_clips_per_video = num_temporal_views_override
+    else:
+        num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
 
     frame_list = []
     for clip_idx in range(num_clips_per_video):
         # for each clip in the video,
         # a list is generated before decoding the specified frames from the video
         list_ = _interval_based_sampling(
-            len(vr), vr.get_avg_fps(), cfg.DATA.TARGET_FPS, clip_idx,
-            num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES,
-            cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL)
+            len(vr),
+            vr.get_avg_fps(),
+            cfg.DATA.TARGET_FPS,
+            clip_idx,
+            num_clips_per_video,
+            cfg.DATA.NUM_INPUT_FRAMES,
+            cfg.DATA.SAMPLING_RATE,
+            cfg.DATA.MINUS_INTERVAL,
+        )
         frames = None
         if path.endswith('.avi'):
             append_list = torch.arange(0, list_[0], 4)
@@ -155,8 +193,6 @@ def _decode_video(cfg, path):
                 vr.get_batch(list_).to_dlpack()).clone()
         frame_list.append(frames)
     frames = torch.stack(frame_list)
-    if num_clips_per_video == 1:
-        frames = frames.squeeze(0)
     del vr
     return frames
 
@@ -224,6 +260,29 @@ class KineticsResizedCrop(object):
                     y = y_max // 2
         return new_clip[:, :, y:y + self.crop_size, x:x + self.crop_size]
 
+    def _get_random_crop(self, clip):
+        _, _, clip_height, clip_width = clip.shape
+
+        short_side = min(clip_height, clip_width)
+        long_side = max(clip_height, clip_width)
+        new_short_side = int(random.uniform(*self.short_side_range))
+        new_long_side = int(long_side / short_side * new_short_side)
+        if clip_height < clip_width:
+            new_clip_height = new_short_side
+            new_clip_width = new_long_side
+        else:
+            new_clip_height = new_long_side
+            new_clip_width = new_short_side
+
+        new_clip = torch.nn.functional.interpolate(
+            clip, size=(new_clip_height, new_clip_width), mode='bilinear')
+
+        x_max = int(new_clip_width - self.crop_size)
+        y_max = int(new_clip_height - self.crop_size)
+        x = int(random.uniform(0, x_max))
+        y = int(random.uniform(0, y_max))
+        return new_clip[:, :, y:y + self.crop_size, x:x + self.crop_size]
+
     def set_spatial_index(self, idx):
         """Set the spatial cropping index for controlled cropping..
         Args:
diff --git a/tests/pipelines/test_hicossl_video_embedding.py b/tests/pipelines/test_hicossl_video_embedding.py
new file mode 100644
index 00000000..5615cef2
--- /dev/null
+++ b/tests/pipelines/test_hicossl_video_embedding.py
@@ -0,0 +1,26 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# !/usr/bin/env python
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class HICOSSLVideoEmbeddingTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_s3dg_video-embedding'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        videossl_pipeline = pipeline(
+            Tasks.video_embedding, model=self.model_id)
+        result = videossl_pipeline(
+            'data/test/videos/action_recognition_test_video.mp4')
+
+        print(f'video embedding output: {result}.')
+
+
+if __name__ == '__main__':
+    unittest.main()

From 3f972785648283c4c86f20806b270b28eb3149de Mon Sep 17 00:00:00 2001
From: "feiwu.yfw" <feiwu.yfw@alibaba-inc.com>
Date: Thu, 1 Sep 2022 15:26:45 +0800
Subject: [PATCH 033/175] =?UTF-8?q?[to=20#42322933]=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?=E9=9B=86=E6=96=AD=E7=82=B9=E7=BB=AD=E4=BC=A0=E4=B8=8B=E8=BD=BD?=
 =?UTF-8?q?+=E4=BF=AE=E5=A4=8D=E6=95=B0=E6=8D=AE=E9=9B=86=E5=91=BD?=
 =?UTF-8?q?=E5=90=8D=E5=AD=98=E5=9C=A8=E5=A4=A7=E5=86=99=E5=AD=97=E6=AF=8D?=
 =?UTF-8?q?=E5=AF=BC=E8=87=B4=E5=8A=A0=E8=BD=BD=E5=A4=B1=E8=B4=A5=E7=9A=84?=
 =?UTF-8?q?=E9=97=AE=E9=A2=98=20=20=20=20=20=20=20=20=20Link:=20https://co?=
 =?UTF-8?q?de.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9973942?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix msdataset dataset name
* add resume download
---
 modelscope/msdatasets/utils/dataset_builder.py | 8 +++++---
 modelscope/msdatasets/utils/oss_utils.py       | 8 ++++++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py
index 7180cb5b..825400c4 100644
--- a/modelscope/msdatasets/utils/dataset_builder.py
+++ b/modelscope/msdatasets/utils/dataset_builder.py
@@ -5,6 +5,7 @@ import datasets
 import pandas as pd
 import pyarrow as pa
 from datasets.info import DatasetInfo
+from datasets.naming import camelcase_to_snakecase
 from datasets.packaged_modules import csv
 from datasets.utils.filelock import FileLock
 
@@ -34,8 +35,8 @@ class MsCsvDatasetBuilder(csv.Csv):
             data_files=meta_data_files,
             **config_kwargs)
 
-        self.name = dataset_name
-        self.info.builder_name = self.name
+        self.name = camelcase_to_snakecase(dataset_name)
+        self.info.builder_name = dataset_name
         self._cache_dir = self._build_cache_dir(namespace=namespace)
         lock_path = os.path.join(
             self._cache_dir_root,
@@ -65,7 +66,7 @@ class MsCsvDatasetBuilder(csv.Csv):
         or if a namespace has been specified:
             self.namespace___self.name/self.config.version/self.hash/
         """
-        builder_data_dir = self.name if namespace is None else f'{namespace}___{self.name}'
+        builder_data_dir = self.info.builder_name if namespace is None else f'{namespace}___{self.info.builder_name}'
         builder_config = self.config
         hash = self.hash
         if builder_config:
@@ -156,6 +157,7 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):
         self.zip_data_files = zip_data_files
         self.split_path_dict = None
         self.config = None
+        self.info = DatasetInfo.from_dict({'builder_name': dataset_name})
         self._cache_dir_root = os.path.expanduser(cache_dir)
         self._cache_dir = self._build_cache_dir()
         self._config_kwargs = config_kwargs
diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py
index 82d43bef..63a1cf77 100644
--- a/modelscope/msdatasets/utils/oss_utils.py
+++ b/modelscope/msdatasets/utils/oss_utils.py
@@ -34,8 +34,12 @@ class OssUtilities:
         local_path = os.path.join(cache_dir, filename)
 
         if download_config.force_download or not os.path.exists(local_path):
-            self.bucket.get_object_to_file(
-                file_oss_key, local_path, progress_callback=self._percentage)
+            oss2.resumable_download(
+                self.bucket,
+                file_oss_key,
+                local_path,
+                multiget_threshold=0,
+                progress_callback=self._percentage)
         return local_path
 
     def upload(self, oss_file_name: str, local_file_path: str) -> str:

From ce41ded4237bc7e2279cf4aff10fa3a21dd1c075 Mon Sep 17 00:00:00 2001
From: "pengyu.lpy" <pengyu.lpy@alibaba-inc.com>
Date: Thu, 1 Sep 2022 15:48:40 +0800
Subject: [PATCH 034/175] [to #42322933]modify test_segmentation_pipeline.py
 for damo models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

基于easycv上线的segformer，对应上传了5个对应的达摩院的分割模型，所以修正了tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py内容让其能够便利测试
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9934634
---
 .../test_segmentation_pipeline.py             | 48 +++++++++++++++----
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
index 0eca2a7f..6cfdacc6 100644
--- a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
+++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
@@ -12,24 +12,54 @@ from modelscope.utils.test_utils import test_level
 
 class EasyCVSegmentationPipelineTest(unittest.TestCase):
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
-    def test_segformer_b0(self):
-        img_path = 'data/test/images/image_segmentation.jpg'
-        model_id = 'EasyCV/EasyCV-Segformer-b0'
-        img = np.asarray(Image.open(img_path))
+    img_path = 'data/test/images/image_segmentation.jpg'
+
+    def _internal_test__(self, model_id):
+        img = np.asarray(Image.open(self.img_path))
+
+        semantic_seg = pipeline(task=Tasks.image_segmentation, model=model_id)
+        outputs = semantic_seg(self.img_path)
 
-        object_detect = pipeline(task=Tasks.image_segmentation, model=model_id)
-        outputs = object_detect(img_path)
         self.assertEqual(len(outputs), 1)
 
         results = outputs[0]
         self.assertListEqual(
             list(img.shape)[:2], list(results['seg_pred'][0].shape))
-        self.assertListEqual(results['seg_pred'][0][1, :10].tolist(),
-                             [161 for i in range(10)])
+        self.assertListEqual(results['seg_pred'][0][1, 4:10].tolist(),
+                             [161 for i in range(6)])
         self.assertListEqual(results['seg_pred'][0][-1, -10:].tolist(),
                              [133 for i in range(10)])
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_segformer_b0(self):
+        model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k'
+        self._internal_test__(model_id)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_segformer_b1(self):
+        model_id = 'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k'
+        self._internal_test__(model_id)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_segformer_b2(self):
+        model_id = 'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k'
+        self._internal_test__(model_id)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_segformer_b3(self):
+        model_id = 'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k'
+        self._internal_test__(model_id)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_segformer_b4(self):
+        model_id = 'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k'
+        self._internal_test__(model_id)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_segformer_b5(self):
+        model_id = 'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k'
+        self._internal_test__(model_id)
+
 
 if __name__ == '__main__':
     unittest.main()

From 8f81807537c9df1a6967c8639481b7ab85504685 Mon Sep 17 00:00:00 2001
From: "fubang.zfb" <fubang.zfb@alibaba-inc.com>
Date: Thu, 1 Sep 2022 16:31:51 +0800
Subject: [PATCH 035/175] =?UTF-8?q?[to=20#42322933]=20=E5=85=B3=E7=B3=BB?=
 =?UTF-8?q?=E6=8A=BD=E5=8F=96=20=20=20=20=20=20=20=20=20Link:=20https://co?=
 =?UTF-8?q?de.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9938140?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 modelscope/metainfo.py                        |   4 +
 modelscope/models/nlp/__init__.py             |  11 +-
 .../nlp/heads/infromation_extraction_head.py  | 106 ++++++++++++++++++
 modelscope/models/nlp/task_models/__init__.py |  26 +++++
 .../nlp/task_models/information_extraction.py |  49 ++++++++
 modelscope/outputs.py                         |   3 +-
 modelscope/pipelines/builder.py               |   3 +
 modelscope/pipelines/nlp/__init__.py          |   6 +-
 .../nlp/information_extraction_pipeline.py    |  42 +++++++
 modelscope/preprocessors/__init__.py          |   6 +-
 modelscope/preprocessors/nlp.py               |  49 +++++++-
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_relation_extraction.py   |  57 ++++++++++
 13 files changed, 354 insertions(+), 9 deletions(-)
 create mode 100644 modelscope/models/nlp/heads/infromation_extraction_head.py
 create mode 100644 modelscope/models/nlp/task_models/information_extraction.py
 create mode 100644 modelscope/pipelines/nlp/information_extraction_pipeline.py
 create mode 100644 tests/pipelines/test_relation_extraction.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 51fed99f..6f34b1a3 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -69,6 +69,7 @@ class Models(object):
 class TaskModels(object):
     # nlp task
     text_classification = 'text-classification'
+    information_extraction = 'information-extraction'
 
 
 class Heads(object):
@@ -78,6 +79,7 @@ class Heads(object):
     bert_mlm = 'bert-mlm'
     # roberta mlm
     roberta_mlm = 'roberta-mlm'
+    information_extraction = 'information-extraction'
 
 
 class Pipelines(object):
@@ -156,6 +158,7 @@ class Pipelines(object):
     text_error_correction = 'text-error-correction'
     faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    relation_extraction = 'relation-extraction'
     document_segmentation = 'document-segmentation'
 
     # audio tasks
@@ -248,6 +251,7 @@ class Preprocessors(object):
     fill_mask = 'fill-mask'
     faq_question_answering_preprocessor = 'faq-question-answering-preprocessor'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    re_tokenizer = 're-tokenizer'
     document_segmentation = 'document-segmentation'
 
     # audio preprocessor
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 90a37cea..e17a1d31 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -21,7 +21,9 @@ if TYPE_CHECKING:
     from .space import SpaceForDialogModeling
     from .space import SpaceForDialogStateTracking
     from .star_text_to_sql import StarForTextToSql
-    from .task_models.task_model import SingleBackboneTaskModelBase
+    from .task_models import (InformationExtractionModel,
+                              SequenceClassificationModel,
+                              SingleBackboneTaskModelBase)
     from .bart_for_text_error_correction import BartForTextErrorCorrection
     from .gpt3 import GPT3ForTextGeneration
     from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering
@@ -48,10 +50,13 @@ else:
             'SpaceForDialogIntent', 'SpaceForDialogModeling',
             'SpaceForDialogStateTracking'
         ],
-        'task_model': ['SingleBackboneTaskModelBase'],
+        'task_models': [
+            'InformationExtractionModel', 'SequenceClassificationModel',
+            'SingleBackboneTaskModelBase'
+        ],
         'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
         'gpt3': ['GPT3ForTextGeneration'],
-        'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering']
+        'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/heads/infromation_extraction_head.py b/modelscope/models/nlp/heads/infromation_extraction_head.py
new file mode 100644
index 00000000..cf957834
--- /dev/null
+++ b/modelscope/models/nlp/heads/infromation_extraction_head.py
@@ -0,0 +1,106 @@
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+
+@HEADS.register_module(
+    Tasks.information_extraction, module_name=Heads.information_extraction)
+class InformationExtractionHead(TorchHead):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        config = self.config
+        assert config.get('labels') is not None
+        self.labels = config.labels
+        self.s_layer = nn.Linear(config.hidden_size, 2)  # head, tail, bce
+        self.o_layer = nn.Linear(2 * config.hidden_size, 2)  # head, tail, bce
+        self.p_layer = nn.Linear(config.hidden_size,
+                                 len(self.labels))  # label, ce
+        self.mha = nn.MultiheadAttention(config.hidden_size, 4)
+
+    def forward(self, sequence_output, text, offsets, threshold=0.5):
+        # NOTE: assumes batch size == 1; only batch index 0 is decoded below
+        spos = []
+        s_head_logits, s_tail_logits = self.s_layer(sequence_output).split(
+            1, dim=-1)  # (b, seq_len, 2)
+        s_head_logits = s_head_logits[0, :, 0].sigmoid()  # (seq_len)
+        s_tail_logits = s_tail_logits[0, :, 0].sigmoid()  # (seq_len)
+        s_masks, subjects = self._get_masks_and_mentions(
+            text, offsets, s_head_logits, s_tail_logits, None, threshold)
+        for s_mask, subject in zip(s_masks, subjects):
+            masked_sequence_output = sequence_output * s_mask.unsqueeze(
+                0).unsqueeze(-1)  # (b, s, h)
+            subjected_sequence_output = self.mha(
+                sequence_output.permute(1, 0, 2),
+                masked_sequence_output.permute(1, 0, 2),
+                masked_sequence_output.permute(1, 0,
+                                               2))[0].permute(1, 0,
+                                                              2)  # (b, s, h)
+            cat_sequence_output = torch.cat(
+                (sequence_output, subjected_sequence_output), dim=-1)
+            o_head_logits, o_tail_logits = self.o_layer(
+                cat_sequence_output).split(
+                    1, dim=-1)
+            o_head_logits = o_head_logits[0, :, 0].sigmoid()  # (seq_len)
+            o_tail_logits = o_tail_logits[0, :, 0].sigmoid()  # (seq_len)
+            so_masks, objects = self._get_masks_and_mentions(
+                text, offsets, o_head_logits, o_tail_logits, s_mask, threshold)
+            for so_mask, object in zip(so_masks, objects):
+                masked_sequence_output = (
+                    sequence_output * so_mask.unsqueeze(0).unsqueeze(-1)).sum(
+                        1)  # (b, h)
+                lengths = so_mask.unsqueeze(0).sum(-1, keepdim=True)  # (b, 1)
+                pooled_subject_object = masked_sequence_output / lengths  # (b, h)
+                label = self.p_layer(pooled_subject_object).sigmoid().squeeze(
+                    0)
+                for i in range(label.size(-1)):
+                    if label[i] > threshold:
+                        predicate = self.labels[i]
+                        spos.append((subject, predicate, object))
+        return spos
+
+    def _get_masks_and_mentions(self,
+                                text,
+                                offsets,
+                                heads,
+                                tails,
+                                init_mask=None,
+                                threshold=0.5):
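+        '''Pair each head with the best tail before the next head; returns (masks, mentions).
+        text: str, the raw sentence that mentions are sliced from
+        offsets: per-token (char_start, char_end) pairs used to slice `text`
+        heads, tails: 1-D tensors of per-token scores, compared with `threshold`
+        '''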
+        seq_len = heads.size(-1)
+        potential_heads = []
+        for i in range(seq_len - 1):
+            if heads[i] > threshold:
+                potential_heads.append(i)
+        potential_heads.append(seq_len - 1)
+        masks = []
+        mentions = []
+        for i in range(len(potential_heads) - 1):
+            head_index = potential_heads[i]
+            tail_index, max_val = None, 0
+            for j in range(head_index, potential_heads[i + 1]):
+                if tails[j] > max_val and tails[j] > threshold:
+                    tail_index = j
+                    max_val = tails[j]
+            if tail_index is not None:
+                mask = torch.zeros_like(
+                    heads) if init_mask is None else init_mask.clone()
+                mask[head_index:tail_index + 1] = 1
+                masks.append(mask)  # (seq_len)
+                char_head = offsets[head_index][0]
+                char_tail = offsets[tail_index][1]
+                mention = text[char_head:char_tail]
+                mentions.append(mention)
+        return masks, mentions
diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py
index e69de29b..49cf0ee4 100644
--- a/modelscope/models/nlp/task_models/__init__.py
+++ b/modelscope/models/nlp/task_models/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .information_extraction import InformationExtractionModel
+    from .sequence_classification import SequenceClassificationModel
+    from .task_model import SingleBackboneTaskModelBase
+
+else:
+    _import_structure = {
+        'information_extraction': ['InformationExtractionModel'],
+        'sequence_classification': ['SequenceClassificationModel'],
+        'task_model': ['SingleBackboneTaskModelBase'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py
new file mode 100644
index 00000000..20a44787
--- /dev/null
+++ b/modelscope/models/nlp/task_models/information_extraction.py
@@ -0,0 +1,49 @@
+from typing import Any, Dict
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+from modelscope.utils.tensor_utils import (torch_nested_detach,
+                                           torch_nested_numpify)
+
+__all__ = ['InformationExtractionModel']
+
+
+@MODELS.register_module(
+    Tasks.information_extraction,
+    module_name=TaskModels.information_extraction)
+class InformationExtractionModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the information extraction model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+
+        backbone_cfg = self.cfg.backbone
+        head_cfg = self.cfg.head
+        self.build_backbone(backbone_cfg)
+        self.build_head(head_cfg)
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        outputs = super().forward(input)
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        outputs = self.head.forward(sequence_output, input['text'],
+                                    input['offsets'])
+        return {OutputKeys.SPO_LIST: outputs}
+
+    def extract_backbone_outputs(self, outputs):
+        sequence_output = None
+        pooled_output = None
+        if hasattr(self.backbone, 'extract_sequence_outputs'):
+            sequence_output = self.backbone.extract_sequence_outputs(outputs)
+        return sequence_output, pooled_output
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 7c0e08dc..aebb9138 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -302,8 +302,7 @@ TASK_OUTPUTS = {
     #     "text": "《父老乡亲》是由是由由中国人民解放军海政文工团创作的军旅歌曲，石顺义作词，王锡仁作曲，范琳琳演唱",
     #     "spo_list": [{"subject": "石顺义", "predicate": "国籍", "object": "中国"}]
     # }
-    Tasks.relation_extraction:
-    [OutputKeys.UUID, OutputKeys.TEXT, OutputKeys.SPO_LIST],
+    Tasks.relation_extraction: [OutputKeys.SPO_LIST],
 
     # translation result for a source sentence
     #   {
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 943578fb..8a1a3646 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -23,6 +23,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.named_entity_recognition:
     (Pipelines.named_entity_recognition,
      'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
+    Tasks.information_extraction:
+    (Pipelines.relation_extraction,
+     'damo/nlp_bert_relation-extraction_chinese-base'),
     Tasks.sentence_similarity:
     (Pipelines.sentence_similarity,
      'damo/nlp_structbert_sentence-similarity_chinese-base'),
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 2dd5bf62..665e016d 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -10,6 +10,7 @@ if TYPE_CHECKING:
     from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline
     from .document_segmentation_pipeline import DocumentSegmentationPipeline
     from .fill_mask_pipeline import FillMaskPipeline
+    from .information_extraction_pipeline import InformationExtractionPipeline
     from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline
     from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline
     from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline
@@ -22,6 +23,7 @@ if TYPE_CHECKING:
     from .text_classification_pipeline import TextClassificationPipeline
     from .text_error_correction_pipeline import TextErrorCorrectionPipeline
     from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline
+    from .relation_extraction_pipeline import RelationExtractionPipeline
 
 else:
     _import_structure = {
@@ -33,6 +35,7 @@ else:
         'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'],
         'document_segmentation_pipeline': ['DocumentSegmentationPipeline'],
         'fill_mask_pipeline': ['FillMaskPipeline'],
+        'information_extraction_pipeline': ['InformationExtractionPipeline'],
         'single_sentence_classification_pipeline':
         ['SingleSentenceClassificationPipeline'],
         'pair_sentence_classification_pipeline':
@@ -48,7 +51,8 @@ else:
         'summarization_pipeline': ['SummarizationPipeline'],
         'text_classification_pipeline': ['TextClassificationPipeline'],
         'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'],
-        'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline']
+        'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'],
+        'relation_extraction_pipeline': ['RelationExtractionPipeline']
     }
 
     import sys
diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py
new file mode 100644
index 00000000..4cb138d6
--- /dev/null
+++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py
@@ -0,0 +1,42 @@
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import (Preprocessor,
+                                      RelationExtractionPreprocessor)
+from modelscope.utils.constant import Tasks
+
+__all__ = ['InformationExtractionPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.information_extraction, module_name=Pipelines.relation_extraction)
+class InformationExtractionPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = RelationExtractionPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return super().forward(inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Any],
+                    **postprocess_params) -> Dict[str, str]:
+        return inputs
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index d365b6fa..9f7d595e 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -22,7 +22,8 @@ if TYPE_CHECKING:
                       PairSentenceClassificationPreprocessor,
                       FillMaskPreprocessor, ZeroShotClassificationPreprocessor,
                       NERPreprocessor, TextErrorCorrectionPreprocessor,
-                      FaqQuestionAnsweringPreprocessor)
+                      FaqQuestionAnsweringPreprocessor,
+                      RelationExtractionPreprocessor)
     from .slp import DocumentSegmentationPreprocessor
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
@@ -51,7 +52,8 @@ else:
             'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
             'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
             'TextErrorCorrectionPreprocessor',
-            'FaqQuestionAnsweringPreprocessor'
+            'FaqQuestionAnsweringPreprocessor',
+            'RelationExtractionPreprocessor'
         ],
         'slp': ['DocumentSegmentationPreprocessor'],
         'space': [
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 578bbd49..4882c477 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -22,7 +22,8 @@ __all__ = [
     'PairSentenceClassificationPreprocessor',
     'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
     'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
-    'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor'
+    'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor',
+    'RelationExtractionPreprocessor'
 ]
 
 
@@ -622,6 +623,52 @@ class NERPreprocessor(Preprocessor):
         }
 
 
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.re_tokenizer)
+class RelationExtractionPreprocessor(Preprocessor):
+    """The tokenizer preprocessor used in the normal relation extraction (RE) task.
+
+    NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition.
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Initialize the preprocessor and load the fast tokenizer from `model_dir`.
+
+        Args:
+            model_dir (str): model path
+        """
+
+        super().__init__(*args, **kwargs)
+
+        self.model_dir: str = model_dir
+        self.sequence_length = kwargs.pop('sequence_length', 512)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_dir, use_fast=True)
+
+    @type_assert(object, str)
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """Tokenize one raw sentence into the model inputs.
+
+        Args:
+            data (str): a sentence
+                Example:
+                    'you are so handsome.'
+
+        Returns:
+            Dict[str, Any]: the keys 'text', 'input_ids', 'attention_mask' and 'offsets'
+        """
+
+        # preprocess the data for the model input
+        text = data
+        output = self.tokenizer([text], return_tensors='pt')
+        return {
+            'text': text,
+            'input_ids': output['input_ids'],
+            'attention_mask': output['attention_mask'],
+            'offsets': output[0].offsets
+        }
+
+
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.text_error_correction)
 class TextErrorCorrectionPreprocessor(Preprocessor):
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index a9d1345d..960e9600 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -99,6 +99,7 @@ class NLPTasks(object):
     text_error_correction = 'text-error-correction'
     faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    information_extraction = 'information-extraction'
     document_segmentation = 'document-segmentation'
 
 
diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py
new file mode 100644
index 00000000..20502a19
--- /dev/null
+++ b/tests/pipelines/test_relation_extraction.py
@@ -0,0 +1,57 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import torch
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import InformationExtractionModel
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import InformationExtractionPipeline
+from modelscope.preprocessors import RelationExtractionPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class RelationExtractionTest(unittest.TestCase):
+    model_id = 'damo/nlp_bert_relation-extraction_chinese-base'
+    sentence = '高捷，祖籍江苏，本科毕业于东南大学'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = RelationExtractionPreprocessor(cache_path)
+        model = InformationExtractionModel.from_pretrained(cache_path)
+        pipeline1 = InformationExtractionPipeline(
+            model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.information_extraction, model=model, preprocessor=tokenizer)
+        print(f'sentence: {self.sentence}\n'
+              f'pipeline1:{pipeline1(input=self.sentence)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.sentence)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = RelationExtractionPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.information_extraction,
+            model=model,
+            preprocessor=tokenizer)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.information_extraction, model=self.model_id)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.information_extraction)
+        print(pipeline_ins(input=self.sentence))
+
+
+if __name__ == '__main__':
+    unittest.main()

From 291f8fe68c3462abc6462c5e408e7f349203f630 Mon Sep 17 00:00:00 2001
From: "lllcho.lc" <lllcho.lc@alibaba-inc.com>
Date: Thu, 1 Sep 2022 18:14:37 +0800
Subject: [PATCH 036/175] [to #42322933] Add action-detection model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

添加新的action-detection task
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9898947
---
 .../videos/action_detection_test_video.mp4    |   3 +
 modelscope/metainfo.py                        |   1 +
 .../models/cv/action_detection/__init__.py    |  21 +++
 .../action_detection/action_detection_onnx.py | 177 ++++++++++++++++++
 modelscope/outputs.py                         |  15 ++
 modelscope/pipelines/builder.py               |   2 +
 modelscope/pipelines/cv/__init__.py           |   2 +
 .../pipelines/cv/action_detection_pipeline.py |  63 +++++++
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_action_detection.py      |  22 +++
 10 files changed, 307 insertions(+)
 create mode 100644 data/test/videos/action_detection_test_video.mp4
 create mode 100644 modelscope/models/cv/action_detection/__init__.py
 create mode 100644 modelscope/models/cv/action_detection/action_detection_onnx.py
 create mode 100644 modelscope/pipelines/cv/action_detection_pipeline.py
 create mode 100644 tests/pipelines/test_action_detection.py

diff --git a/data/test/videos/action_detection_test_video.mp4 b/data/test/videos/action_detection_test_video.mp4
new file mode 100644
index 00000000..e2ea1d80
--- /dev/null
+++ b/data/test/videos/action_detection_test_video.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b7c3bc7c82ea5fee9d83130041df01046d89143ff77058b04577455ff6fdc92
+size 3191059
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 6f34b1a3..7c5afe80 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -133,6 +133,7 @@ class Pipelines(object):
     skin_retouching = 'unet-skin-retouching'
     tinynas_classification = 'tinynas-classification'
     crowd_counting = 'hrnet-crowd-counting'
+    action_detection = 'ResNetC3D-action-detection'
     video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'
     image_panoptic_segmentation = 'image-panoptic-segmentation'
     video_summarization = 'googlenet_pgl_video_summarization'
diff --git a/modelscope/models/cv/action_detection/__init__.py b/modelscope/models/cv/action_detection/__init__.py
new file mode 100644
index 00000000..fedbe19c
--- /dev/null
+++ b/modelscope/models/cv/action_detection/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .action_detection_onnx import ActionDetONNX
+
+else:
+    _import_structure = {'action_detection_onnx': ['ActionDetONNX']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/action_detection/action_detection_onnx.py b/modelscope/models/cv/action_detection/action_detection_onnx.py
new file mode 100644
index 00000000..3c171473
--- /dev/null
+++ b/modelscope/models/cv/action_detection/action_detection_onnx.py
@@ -0,0 +1,177 @@
+import os
+import os.path as osp
+import shutil
+import subprocess
+
+import cv2
+import numpy as np
+import onnxruntime as rt
+
+from modelscope.models import Model
+from modelscope.utils.constant import Devices
+from modelscope.utils.device import verify_device
+
+
+class ActionDetONNX(Model):
+
+    def __init__(self, model_dir, config, *args, **kwargs):
+        """Create the ONNX Runtime session and cache inference settings."""
+        # super() already binds self; passing it again would shift model_dir
+        # into the next positional slot of the base initializer.
+        super().__init__(model_dir, *args, **kwargs)
+        # config must provide 'model_file' and every key indexed below.
+        model_file = osp.join(config['model_file'])
+        device_type, device_id = verify_device(self._device_name)
+        options = rt.SessionOptions()
+        options.intra_op_num_threads = 1
+        options.inter_op_num_threads = 1
+        if device_type == Devices.gpu:
+            sess = rt.InferenceSession(
+                model_file, providers=['CUDAExecutionProvider'],
+                sess_options=options,
+                provider_options=[{'device_id': device_id}])
+        else:
+            sess = rt.InferenceSession(
+                model_file, providers=['CPUExecutionProvider'],
+                sess_options=options)
+        self.input_name = sess.get_inputs()[0].name
+        self.sess = sess
+        self.num_stride = len(config['fpn_strides'])
+        self.score_thresh = np.asarray(
+            config['pre_nms_thresh'], dtype='float32').reshape((1, -1))
+        self.size_divisibility = config['size_divisibility']
+        self.nms_threshold = config['nms_thresh']
+        self.tmp_dir = config['tmp_dir']
+        self.temporal_stride = config['step']
+        self.input_data_type = config['input_type']
+        self.action_names = config['action_names']
+        self.video_length_limit = config['video_length_limit']
+
+    def resize_box(self, det, height, width, scale_h, scale_w):
+        bboxs = det[0]
+        bboxs[:, [0, 2]] *= scale_w
+        bboxs[:, [1, 3]] *= scale_h
+        bboxs[:, [0, 2]] = bboxs[:, [0, 2]].clip(0, width - 1)
+        bboxs[:, [1, 3]] = bboxs[:, [1, 3]].clip(0, height - 1)
+        result = {
+            'boxes': bboxs.round().astype('int32').tolist(),
+            'scores': det[1].tolist(),
+            'labels': [self.action_names[i] for i in det[2].tolist()]
+        }
+        return result
+
+    def parse_frames(self, frame_names):
+        """Load frames as RGB and stack them into a (1, c, t, h, w) array."""
+        frames = [cv2.imread(path)[:, :, ::-1] for path in frame_names]
+        clip = np.stack(frames).astype(self.input_data_type)
+        # (t, h, w, c) -> (c, t, h, w), then add a leading axis of size 1
+        return clip.transpose((3, 0, 1, 2))[None]
+
+    def forward_img(self, imgs, h, w):
+        pred = self.sess.run(None, {
+            self.input_name: imgs,
+            'height': np.asarray(h),
+            'width': np.asarray(w)
+        })
+        dets = self.post_nms(
+            pred,
+            score_threshold=self.score_thresh,
+            nms_threshold=self.nms_threshold)
+        return dets
+
+    def forward_video(self, video_name, scale):
+        """Run detection over sliding frame windows sampled from a video.
+
+        Frames are dumped by ffmpeg into a scratch directory that is always
+        removed again. Returns a list of {'timestamp', 'actions'} dicts.
+        """
+        min_size, max_size = self._get_sizes(scale)
+        frame_rate, window = 2, 4  # ffmpeg -r value; frames per inference
+        stem = osp.splitext(osp.basename(video_name))[0]
+        tmp_dir = osp.join(self.tmp_dir, stem)
+        if osp.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+        os.makedirs(tmp_dir)
+        try:
+            # Argument list (not a split string) so paths may contain spaces.
+            subprocess.call([
+                'ffmpeg', '-y', '-loglevel', 'quiet', '-ss', '0', '-t',
+                str(self.video_length_limit), '-i', video_name, '-r',
+                str(frame_rate), '-f', 'image2', f'{tmp_dir}/%06d.jpg'
+            ])
+            names = sorted(
+                osp.join(tmp_dir, name) for name in os.listdir(tmp_dir)
+                if name.endswith('.jpg'))
+            hop = frame_rate * self.temporal_stride
+            batch_imgs = [
+                self.parse_frames(names[i:i + window])
+                for i in range(0, len(names) - window + 1, hop)
+            ]
+        finally:
+            shutil.rmtree(tmp_dir)
+        if not batch_imgs:
+            return []  # too few frames extracted for a single window
+
+        H, W = batch_imgs[0].shape[-2:]
+        scale_min = min_size / min(H, W)
+        h = min(int(scale_min * H), max_size)
+        w = min(int(scale_min * W), max_size)
+        h = round(h / self.size_divisibility) * self.size_divisibility
+        w = round(w / self.size_divisibility) * self.size_divisibility
+        scale_h, scale_w = H / h, W / w
+        results = []
+        for k, imgs in enumerate(batch_imgs):
+            det = self.forward_img(imgs, h, w)
+            results.append({
+                # 1, 1 + stride, 1 + 2 * stride, ...: one stamp per window
+                'timestamp': k * self.temporal_stride + 1,
+                'actions': self.resize_box(det[0], H, W, scale_h, scale_w)
+            })
+        return results
+
+    def forward(self, video_name):
+        return self.forward_video(video_name, scale=1)
+
+    def post_nms(self, pred, score_threshold, nms_threshold=0.3):
+        pred_bboxes, pred_scores = pred
+        N = len(pred_bboxes)
+        dets = []
+        for i in range(N):
+            bboxes, scores = pred_bboxes[i], pred_scores[i]
+            candidate_inds = scores > score_threshold
+            scores = scores[candidate_inds]
+            candidate_nonzeros = candidate_inds.nonzero()
+            bboxes = bboxes[candidate_nonzeros[0]]
+            labels = candidate_nonzeros[1]
+            keep = self._nms(bboxes, scores, labels, nms_threshold)
+            bbox = bboxes[keep]
+            score = scores[keep]
+            label = labels[keep]
+            dets.append((bbox, score, label))
+        return dets
+
+    def _nms(self, boxes, scores, idxs, nms_threshold):
+        if len(boxes) == 0:
+            return []
+        max_coordinate = boxes.max()
+        offsets = idxs * (max_coordinate + 1)
+        boxes_for_nms = boxes + offsets[:, None].astype('float32')
+        boxes_for_nms[:, 2] = boxes_for_nms[:, 2] - boxes_for_nms[:, 0]
+        boxes_for_nms[:, 3] = boxes_for_nms[:, 3] - boxes_for_nms[:, 1]
+        keep = cv2.dnn.NMSBoxes(
+            boxes_for_nms.tolist(),
+            scores.tolist(),
+            score_threshold=0,
+            nms_threshold=nms_threshold)
+        if len(keep.shape) == 2:
+            keep = np.squeeze(keep, 1)
+        return keep
+
+    def _get_sizes(self, scale):
+        """Return the (min_size, max_size) resize bounds for ``scale``."""
+        if scale == 1:
+            return 512, 896
+        if scale == 2:
+            return 768, 1280
+        # every other scale value falls through to the largest bounds
+        return 1024, 1792
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index aebb9138..7d6cdb59 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -35,6 +35,7 @@ class OutputKeys(object):
     UUID = 'uuid'
     WORD = 'word'
     KWS_LIST = 'kws_list'
+    TIMESTAMPS = 'timestamps'
     SPLIT_VIDEO_NUM = 'split_video_num'
     SPLIT_META_DICT = 'split_meta_dict'
 
@@ -541,6 +542,19 @@ TASK_OUTPUTS = {
     # }
     Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS],
 
+    # {
+    #     'labels': ['吸烟', '打电话', '吸烟'],
+    #     'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487],
+    #     'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]],
+    #     'timestamps': [1, 3, 5]
+    # }
+    Tasks.action_detection: [
+        OutputKeys.TIMESTAMPS,
+        OutputKeys.LABELS,
+        OutputKeys.SCORES,
+        OutputKeys.BOXES,
+    ],
+
     # {
     #   'output': [
     #     [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509},
@@ -551,6 +565,7 @@ TASK_OUTPUTS = {
     #      {'label': '13421097', 'score': 2.75914817393641e-06}]]
     # }
     Tasks.faq_question_answering: [OutputKeys.OUTPUT],
+
     # image person reid result for single sample
     #   {
     #       "img_embedding": np.array with shape [1, D],
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 8a1a3646..c9f0c252 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -71,6 +71,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
     Tasks.action_recognition: (Pipelines.action_recognition,
                                'damo/cv_TAdaConv_action-recognition'),
+    Tasks.action_detection: (Pipelines.action_detection,
+                             'damo/cv_ResNetC3D_action-detection_detection2d'),
     Tasks.live_category: (Pipelines.live_category,
                           'damo/cv_resnet50_live-category'),
     Tasks.video_category: (Pipelines.video_category,
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 01c69758..f4e6792b 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -5,6 +5,7 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .action_recognition_pipeline import ActionRecognitionPipeline
+    from .action_detection_pipeline import ActionDetectionPipeline
     from .animal_recognition_pipeline import AnimalRecognitionPipeline
     from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline
     from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
 else:
     _import_structure = {
         'action_recognition_pipeline': ['ActionRecognitionPipeline'],
+        'action_detection_pipeline': ['ActionDetectionPipeline'],
         'animal_recognition_pipeline': ['AnimalRecognitionPipeline'],
         'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'],
         'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'],
diff --git a/modelscope/pipelines/cv/action_detection_pipeline.py b/modelscope/pipelines/cv/action_detection_pipeline.py
new file mode 100644
index 00000000..72335d5b
--- /dev/null
+++ b/modelscope/pipelines/cv/action_detection_pipeline.py
@@ -0,0 +1,63 @@
+import math
+import os.path as osp
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.action_detection import ActionDetONNX
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.action_detection, module_name=Pipelines.action_detection)
+class ActionDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create an action detection pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        model_path = osp.join(self.model, ModelFile.ONNX_MODEL_FILE)
+        logger.info(f'loading model from {model_path}')
+        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
+        logger.info(f'loading config from {config_path}')
+        self.cfg = Config.from_file(config_path)
+        self.cfg.MODEL.model_file = model_path
+        self.model = ActionDetONNX(self.model, self.cfg.MODEL,
+                                   self.device_name)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Wrap the video path in a dict; non-str input raises TypeError."""
+        # guard first: anything other than a str is rejected outright
+        if not isinstance(input, str):
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+        # 'video_name' is the key that forward() reads back
+        return {'video_name': input}
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        preds = self.model.forward(input['video_name'])
+        labels = sum([pred['actions']['labels'] for pred in preds], [])
+        scores = sum([pred['actions']['scores'] for pred in preds], [])
+        boxes = sum([pred['actions']['boxes'] for pred in preds], [])
+        timestamps = sum([[pred['timestamp']] * len(pred['actions']['labels'])
+                          for pred in preds], [])
+        out = {
+            OutputKeys.TIMESTAMPS: timestamps,
+            OutputKeys.LABELS: labels,
+            OutputKeys.SCORES: scores,
+            OutputKeys.BOXES: boxes
+        }
+        return out
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 960e9600..2265ef5a 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -58,6 +58,7 @@ class CVTasks(object):
     # video recognition
     live_category = 'live-category'
     action_recognition = 'action-recognition'
+    action_detection = 'action-detection'
     video_category = 'video-category'
     video_embedding = 'video-embedding'
     virtual_try_on = 'virtual-try-on'
diff --git a/tests/pipelines/test_action_detection.py b/tests/pipelines/test_action_detection.py
new file mode 100644
index 00000000..c752dc78
--- /dev/null
+++ b/tests/pipelines/test_action_detection.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class ActionDetectionTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        action_detection_pipline = pipeline(
+            Tasks.action_detection,
+            model='damo/cv_ResNetC3D_action-detection_detection2d')
+        result = action_detection_pipline(
+            'data/test/videos/action_detection_test_video.mp4')
+        print('action detection results:', result)
+
+
+if __name__ == '__main__':
+    unittest.main()

From f5fb8cf5318f3dfb0015484557dd0e03b9c42a8b Mon Sep 17 00:00:00 2001
From: "bin.xue" <bin.xue@alibaba-inc.com>
Date: Thu, 1 Sep 2022 18:56:51 +0800
Subject: [PATCH 037/175] [to #42322933] fix bug about loading new trained
 model and update doc string         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9987197

---
 modelscope/models/audio/ans/__init__.py       |  4 +-
 modelscope/models/audio/ans/complex_nn.py     |  6 ++
 modelscope/models/audio/ans/conv_stft.py      |  1 +
 modelscope/models/audio/ans/frcrn.py          | 62 +++----------------
 .../models/audio/ans/se_module_complex.py     |  1 +
 modelscope/models/audio/ans/unet.py           |  4 ++
 modelscope/trainers/audio/ans_trainer.py      |  7 +--
 modelscope/utils/audio/audio_utils.py         | 18 +++---
 8 files changed, 32 insertions(+), 71 deletions(-)

diff --git a/modelscope/models/audio/ans/__init__.py b/modelscope/models/audio/ans/__init__.py
index b602ad01..afcdf314 100644
--- a/modelscope/models/audio/ans/__init__.py
+++ b/modelscope/models/audio/ans/__init__.py
@@ -4,11 +4,11 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
-    from .frcrn import FRCRNModel
+    from .frcrn import FRCRNDecorator
 
 else:
     _import_structure = {
-        'frcrn': ['FRCRNModel'],
+        'frcrn': ['FRCRNDecorator'],
     }
 
     import sys
diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py
index 69dec41e..c61446c2 100644
--- a/modelscope/models/audio/ans/complex_nn.py
+++ b/modelscope/models/audio/ans/complex_nn.py
@@ -1,3 +1,9 @@
+"""
+class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d are the work of
+Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ).
+from https://github.com/sweetcocoa/DeepComplexUNetPyTorch
+
+"""
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/audio/ans/conv_stft.py b/modelscope/models/audio/ans/conv_stft.py
index a47d7817..4b393a4c 100644
--- a/modelscope/models/audio/ans/conv_stft.py
+++ b/modelscope/models/audio/ans/conv_stft.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import numpy as np
 import torch
 import torch.nn as nn
diff --git a/modelscope/models/audio/ans/frcrn.py b/modelscope/models/audio/ans/frcrn.py
index 59411fbe..b74fc273 100644
--- a/modelscope/models/audio/ans/frcrn.py
+++ b/modelscope/models/audio/ans/frcrn.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Dict
 
@@ -14,54 +15,10 @@ from .conv_stft import ConviSTFT, ConvSTFT
 from .unet import UNet
 
 
-class FTB(nn.Module):
-
-    def __init__(self, input_dim=257, in_channel=9, r_channel=5):
-
-        super(FTB, self).__init__()
-        self.in_channel = in_channel
-        self.conv1 = nn.Sequential(
-            nn.Conv2d(in_channel, r_channel, kernel_size=[1, 1]),
-            nn.BatchNorm2d(r_channel), nn.ReLU())
-
-        self.conv1d = nn.Sequential(
-            nn.Conv1d(
-                r_channel * input_dim, in_channel, kernel_size=9, padding=4),
-            nn.BatchNorm1d(in_channel), nn.ReLU())
-        self.freq_fc = nn.Linear(input_dim, input_dim, bias=False)
-
-        self.conv2 = nn.Sequential(
-            nn.Conv2d(in_channel * 2, in_channel, kernel_size=[1, 1]),
-            nn.BatchNorm2d(in_channel), nn.ReLU())
-
-    def forward(self, inputs):
-        '''
-        inputs should be [Batch, Ca, Dim, Time]
-        '''
-        # T-F attention
-        conv1_out = self.conv1(inputs)
-        B, C, D, T = conv1_out.size()
-        reshape1_out = torch.reshape(conv1_out, [B, C * D, T])
-        conv1d_out = self.conv1d(reshape1_out)
-        conv1d_out = torch.reshape(conv1d_out, [B, self.in_channel, 1, T])
-
-        # now is also [B,C,D,T]
-        att_out = conv1d_out * inputs
-
-        # tranpose to [B,C,T,D]
-        att_out = torch.transpose(att_out, 2, 3)
-        freqfc_out = self.freq_fc(att_out)
-        att_out = torch.transpose(freqfc_out, 2, 3)
-
-        cat_out = torch.cat([att_out, inputs], 1)
-        outputs = self.conv2(cat_out)
-        return outputs
-
-
 @MODELS.register_module(
     Tasks.acoustic_noise_suppression,
     module_name=Models.speech_frcrn_ans_cirm_16k)
-class FRCRNModel(TorchModel):
+class FRCRNDecorator(TorchModel):
     r""" A decorator of FRCRN for integrating into modelscope framework """
 
     def __init__(self, model_dir: str, *args, **kwargs):
@@ -78,13 +35,14 @@ class FRCRNModel(TorchModel):
             checkpoint = torch.load(
                 model_bin_file, map_location=torch.device('cpu'))
             if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
-                self.model.load_state_dict(
-                    checkpoint['state_dict'], strict=False)
+                # a model newly trained by the user is based on FRCRNDecorator
+                self.load_state_dict(checkpoint['state_dict'])
             else:
+                # The released model on Modelscope is based on FRCRN
                 self.model.load_state_dict(checkpoint, strict=False)
 
-    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        result_list = self.model.forward(input['noisy'])
+    def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        result_list = self.model.forward(inputs['noisy'])
         output = {
             'spec_l1': result_list[0],
             'wav_l1': result_list[1],
@@ -93,12 +51,12 @@ class FRCRNModel(TorchModel):
             'wav_l2': result_list[4],
             'mask_l2': result_list[5]
         }
-        if 'clean' in input:
+        if 'clean' in inputs:
             mix_result = self.model.loss(
-                input['noisy'], input['clean'], result_list, mode='Mix')
+                inputs['noisy'], inputs['clean'], result_list, mode='Mix')
             output.update(mix_result)
             sisnr_result = self.model.loss(
-                input['noisy'], input['clean'], result_list, mode='SiSNR')
+                inputs['noisy'], inputs['clean'], result_list, mode='SiSNR')
             output.update(sisnr_result)
             # logger hooker will use items under 'log_vars'
             output['log_vars'] = {k: mix_result[k].item() for k in mix_result}
diff --git a/modelscope/models/audio/ans/se_module_complex.py b/modelscope/models/audio/ans/se_module_complex.py
index f62fe523..b58eb6ba 100644
--- a/modelscope/models/audio/ans/se_module_complex.py
+++ b/modelscope/models/audio/ans/se_module_complex.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 from torch import nn
 
diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py
index aa5a4254..ae66eb69 100644
--- a/modelscope/models/audio/ans/unet.py
+++ b/modelscope/models/audio/ans/unet.py
@@ -1,3 +1,7 @@
+"""
+Based on the work of Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ).
+from https://github.com/sweetcocoa/DeepComplexUNetPyTorch
+"""
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/trainers/audio/ans_trainer.py b/modelscope/trainers/audio/ans_trainer.py
index f782b836..37b201ce 100644
--- a/modelscope/trainers/audio/ans_trainer.py
+++ b/modelscope/trainers/audio/ans_trainer.py
@@ -1,10 +1,5 @@
-import time
-from typing import List, Optional, Union
-
-from datasets import Dataset
-
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from modelscope.metainfo import Trainers
-from modelscope.preprocessors import Preprocessor
 from modelscope.trainers import EpochBasedTrainer
 from modelscope.trainers.builder import TRAINERS
 from modelscope.utils.constant import TrainerStages
diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py
index 14374c65..61964345 100644
--- a/modelscope/utils/audio/audio_utils.py
+++ b/modelscope/utils/audio/audio_utils.py
@@ -1,5 +1,4 @@
-import numpy as np
-
+# Copyright (c) Alibaba, Inc. and its affiliates.
 SEGMENT_LENGTH_TRAIN = 16000
 
 
@@ -9,16 +8,13 @@ def to_segment(batch, segment_length=SEGMENT_LENGTH_TRAIN):
     It only works in batch mode.
     """
     noisy_arrays = []
-    for x in batch['noisy']:
-        length = len(x['array'])
-        noisy = np.array(x['array'])
-        for offset in range(segment_length, length, segment_length):
-            noisy_arrays.append(noisy[offset - segment_length:offset])
     clean_arrays = []
-    for x in batch['clean']:
-        length = len(x['array'])
-        clean = np.array(x['array'])
-        for offset in range(segment_length, length, segment_length):
+    for x, y in zip(batch['noisy'], batch['clean']):
+        length = min(len(x['array']), len(y['array']))
+        noisy = x['array']
+        clean = y['array']
+        for offset in range(segment_length, length + 1, segment_length):
+            noisy_arrays.append(noisy[offset - segment_length:offset])
             clean_arrays.append(clean[offset - segment_length:offset])
     return {'noisy': noisy_arrays, 'clean': clean_arrays}
 

From af4c6f70c296cbffdc6a5962791eed179ed611c7 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Thu, 1 Sep 2022 20:06:42 +0800
Subject: [PATCH 038/175] [to #42322933]allow none decorator registry in ast

---
 modelscope/utils/ast_utils.py | 65 ++++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 8 deletions(-)

diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py
index 990a9571..263a81b3 100644
--- a/modelscope/utils/ast_utils.py
+++ b/modelscope/utils/ast_utils.py
@@ -36,6 +36,7 @@ SCAN_SUB_FOLDERS = [
 ]
 INDEXER_FILE = 'ast_indexer'
 DECORATOR_KEY = 'decorators'
+EXPRESS_KEY = 'express'
 FROM_IMPORT_KEY = 'from_imports'
 IMPORT_KEY = 'imports'
 FILE_NAME_KEY = 'filepath'
@@ -45,6 +46,9 @@ INDEX_KEY = 'index'
 REQUIREMENT_KEY = 'requirements'
 MODULE_KEY = 'module'
 CLASS_NAME = 'class_name'
+GROUP_KEY = 'group_key'
+MODULE_NAME = 'module_name'
+MODULE_CLS = 'module_cls'
 
 
 class AstScaning(object):
@@ -53,6 +57,7 @@ class AstScaning(object):
         self.result_import = dict()
         self.result_from_import = dict()
         self.result_decorator = []
+        self.result_express = []
 
     def _is_sub_node(self, node: object) -> bool:
         return isinstance(node,
@@ -108,6 +113,7 @@ class AstScaning(object):
         self.result_import = dict()
         self.result_from_import = dict()
         self.result_decorator = []
+        self.result_express = []
 
     def scan_ast(self, node: Union[ast.AST, None, str]):
         self._setup_global()
@@ -243,13 +249,19 @@ class AstScaning(object):
                             setattr(item, CLASS_NAME, node.name)
                         self.result_decorator.extend(attr)
 
+                    if attr != [] and type(
+                            attr
+                    ).__name__ == 'Call' and parent_node_name == 'Expr':
+                        self.result_express.append(attr)
+
                     out += f'{indentstr()}{field}={representation},\n'
 
             out += indentstr() + ')'
             return {
                 IMPORT_KEY: self.result_import,
                 FROM_IMPORT_KEY: self.result_from_import,
-                DECORATOR_KEY: self.result_decorator
+                DECORATOR_KEY: self.result_decorator,
+                EXPRESS_KEY: self.result_express
             }, out
 
     def _parse_decorator(self, node: ast.AST) -> tuple:
@@ -267,7 +279,10 @@ class AstScaning(object):
         def _get_args_name(nodes: list) -> list:
             result = []
             for node in nodes:
-                result.append(_get_attribute_item(node))
+                if type(node).__name__ == 'Str':
+                    result.append((node.s, None))
+                else:
+                    result.append(_get_attribute_item(node))
             return result
 
         def _get_keyword_name(nodes: ast.AST) -> list:
@@ -276,9 +291,11 @@ class AstScaning(object):
                 if type(node).__name__ == 'keyword':
                     attribute_node = getattr(node, 'value')
                     if type(attribute_node).__name__ == 'Str':
-                        result.append((attribute_node.s, None))
+                        result.append((getattr(node,
+                                               'arg'), attribute_node.s, None))
                     else:
-                        result.append(_get_attribute_item(attribute_node))
+                        result.append((getattr(node, 'arg'), )
+                                      + _get_attribute_item(attribute_node))
             return result
 
         functions = _get_attribute_item(node.func)
@@ -315,10 +332,26 @@ class AstScaning(object):
             args_list.append(default_group)
         if len(keyword_list) == 0 and len(args_list) == 1:
             args_list.append(class_name)
-        if len(keyword_list) == 1 and len(args_list) == 0:
+
+        if len(keyword_list) > 0 and len(args_list) == 0:
+            remove_group_item = None
+            for item in keyword_list:
+                key, name, attr = item
+                if key == GROUP_KEY:
+                    args_list.append((name, attr))
+                    remove_group_item = item
+            if remove_group_item is not None:
+                keyword_list.remove(remove_group_item)
+
+        if len(args_list) == 0:
             args_list.append(default_group)
 
-        args_list.extend(keyword_list)
+        for item in keyword_list:
+            key, name, attr = item
+            if key == MODULE_CLS:
+                class_name = name
+            else:
+                args_list.append((name, attr))
 
         for item in args_list:
             # the case empty input
@@ -347,9 +380,14 @@ class AstScaning(object):
         for node in nodes:
             if type(node).__name__ != 'Call':
                 continue
+            class_name = getattr(node, CLASS_NAME, None)
+            func = getattr(node, 'func')
+
+            if getattr(func, 'attr', None) != REGISTER_MODULE:
+                continue
+
             parse_output = self._parse_decorator(node)
-            index = self._registry_indexer(parse_output,
-                                           getattr(node, CLASS_NAME))
+            index = self._registry_indexer(parse_output, class_name)
             if None is not index:
                 results.append(index)
         return results
@@ -363,6 +401,8 @@ class AstScaning(object):
         node = gast.parse(data)
         output, _ = self.scan_import(node, indent='  ', show_offsets=False)
         output[DECORATOR_KEY] = self.parse_decorators(output[DECORATOR_KEY])
+        output[EXPRESS_KEY] = self.parse_decorators(output[EXPRESS_KEY])
+        output[DECORATOR_KEY].extend(output[EXPRESS_KEY])
         return output
 
 
@@ -481,6 +521,13 @@ class FilesAstScaning(object):
             module_import[value_dict[MODULE_KEY]] = value_dict[IMPORT_KEY]
         return module_import
 
+    def _ignore_useless_keys(self, inverted_index):
+        """Delete two hard-coded keys from the index and return it."""
+        for registry in ('OPTIMIZERS', 'LR_SCHEDULER'):
+            # pop() with a default tolerates a key that is already absent
+            inverted_index.pop((registry, 'default', 'name'), None)
+        return inverted_index
+
     def get_files_scan_results(self,
                                target_dir=MODELSCOPE_PATH,
                                target_folders=SCAN_SUB_FOLDERS):
@@ -514,6 +561,8 @@ class FilesAstScaning(object):
                 MODULE_KEY: module_name
             }
         inverted_index_with_results = self._inverted_index(result)
+        inverted_index_with_results = self._ignore_useless_keys(
+            inverted_index_with_results)
         module_import = self._module_import(result)
         index = {
             INDEX_KEY: inverted_index_with_results,

From 780330897a47bf24437090e48cf4350dae7af8ed Mon Sep 17 00:00:00 2001
From: "peter.lx" <peter.lx@alibaba-inc.com>
Date: Thu, 1 Sep 2022 22:17:14 +0800
Subject: [PATCH 039/175] [to #42322933] add Deberta v2 modeling and fill_mask
 task, with master merged

add Deberta v2 modeling and fill_mask task, with master merged
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9966511
---
 modelscope/metainfo.py                        |    1 +
 modelscope/models/nlp/__init__.py             |   16 +-
 modelscope/models/nlp/deberta_v2/__init__.py  |   73 +
 .../deberta_v2/configuration_deberta_v2.py    |  130 ++
 .../nlp/deberta_v2/modeling_deberta_v2.py     | 1789 +++++++++++++++++
 .../nlp/deberta_v2/tokenization_deberta_v2.py |  546 +++++
 .../tokenization_deberta_v2_fast.py           |  241 +++
 modelscope/models/nlp/masked_language.py      |   39 +
 .../pipelines/nlp/fill_mask_pipeline.py       |   16 +-
 modelscope/preprocessors/nlp.py               |    3 +
 tests/pipelines/test_deberta_tasks.py         |   62 +
 11 files changed, 2907 insertions(+), 9 deletions(-)
 create mode 100644 modelscope/models/nlp/deberta_v2/__init__.py
 create mode 100644 modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py
 create mode 100644 modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py
 create mode 100644 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py
 create mode 100644 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py
 create mode 100644 tests/pipelines/test_deberta_tasks.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 7c5afe80..971dd3f1 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -37,6 +37,7 @@ class Models(object):
     bert = 'bert'
     palm = 'palm-v2'
     structbert = 'structbert'
+    deberta_v2 = 'deberta_v2'
     veco = 'veco'
     translation = 'csanmt-translation'
     space_dst = 'space-dst'
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index e17a1d31..fd61e40b 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -9,12 +9,15 @@ if TYPE_CHECKING:
     from .bert_for_sequence_classification import BertForSequenceClassification
     from .bert_for_document_segmentation import BertForDocumentSegmentation
     from .csanmt_for_translation import CsanmtForTranslation
-    from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM,
-                                  BertForMaskedLM)
+    from .masked_language import (
+        StructBertForMaskedLM,
+        VecoForMaskedLM,
+        BertForMaskedLM,
+        DebertaV2ForMaskedLM,
+    )
     from .nncrf_for_named_entity_recognition import (
         TransformerCRFForNamedEntityRecognition,
         LSTMCRFForNamedEntityRecognition)
-    from .palm_v2 import PalmForTextGeneration
     from .token_classification import SbertForTokenClassification
     from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification
     from .space import SpaceForDialogIntent
@@ -22,7 +25,6 @@ if TYPE_CHECKING:
     from .space import SpaceForDialogStateTracking
     from .star_text_to_sql import StarForTextToSql
     from .task_models import (InformationExtractionModel,
-                              SequenceClassificationModel,
                               SingleBackboneTaskModelBase)
     from .bart_for_text_error_correction import BartForTextErrorCorrection
     from .gpt3 import GPT3ForTextGeneration
@@ -36,8 +38,10 @@ else:
         'csanmt_for_translation': ['CsanmtForTranslation'],
         'bert_for_sequence_classification': ['BertForSequenceClassification'],
         'bert_for_document_segmentation': ['BertForDocumentSegmentation'],
-        'masked_language':
-        ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'],
+        'masked_language': [
+            'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM',
+            'DebertaV2ForMaskedLM'
+        ],
         'nncrf_for_named_entity_recognition': [
             'TransformerCRFForNamedEntityRecognition',
             'LSTMCRFForNamedEntityRecognition'
diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py
new file mode 100644
index 00000000..664fc6c6
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/__init__.py
@@ -0,0 +1,73 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+_import_structure = {
+    # Only names that configuration_deberta_v2 actually defines are listed;
+    # this mapping is rebuilt in the non-TYPE_CHECKING branch below, so the
+    # two definitions are kept in agreement.
+    'configuration_deberta_v2': ['DebertaV2Config'],
+    'tokenization_deberta_v2': ['DebertaV2Tokenizer'],
+}
+
+if TYPE_CHECKING:
+    from .configuration_deberta_v2 import DebertaV2Config
+    from .tokenization_deberta_v2 import DebertaV2Tokenizer
+    from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast
+
+    from .modeling_deberta_v2 import (
+        DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
+        DebertaV2ForMaskedLM,
+        DebertaV2ForMultipleChoice,
+        DebertaV2ForQuestionAnswering,
+        DebertaV2ForSequenceClassification,
+        DebertaV2ForTokenClassification,
+        DebertaV2Model,
+        DebertaV2PreTrainedModel,
+    )
+
+else:
+    _import_structure = {
+        # Export only names the config module defines (matches the imports above).
+        'configuration_deberta_v2': ['DebertaV2Config'],
+        'tokenization_deberta_v2': ['DebertaV2Tokenizer']
+    }
+    _import_structure['tokenization_deberta_v2_fast'] = [
+        'DebertaV2TokenizerFast'
+    ]
+    _import_structure['modeling_deberta_v2'] = [
+        'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST',
+        'DebertaV2ForMaskedLM',
+        'DebertaV2ForMultipleChoice',
+        'DebertaV2ForQuestionAnswering',
+        'DebertaV2ForSequenceClassification',
+        'DebertaV2ForTokenClassification',
+        'DebertaV2Model',
+        'DebertaV2PreTrainedModel',
+    ]
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__)
diff --git a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py b/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py
new file mode 100644
index 00000000..65e8f0b7
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py
@@ -0,0 +1,130 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020, Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DebertaV2Config`."""
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
+
+from transformers import PretrainedConfig
+
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class DebertaV2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
+    DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the DeBERTa
+    [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Arguments:
+        vocab_size (`int`, *optional*, defaults to 128100):
+            Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`DebertaV2Model`].
+        hidden_size (`int`, *optional*, defaults to 1536):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 24):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"`
+            are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 0):
+            The vocabulary size of the `token_type_ids` passed when calling [`DebertaV2Model`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-7):
+            The epsilon used by the layer normalization layers.
+        relative_attention (`bool`, *optional*, defaults to `False`):
+            Whether use relative position encoding.
+        max_relative_positions (`int`, *optional*, defaults to -1):
+            The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value
+            as `max_position_embeddings`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The value used to pad input_ids.
+        position_biased_input (`bool`, *optional*, defaults to `True`):
+            Whether add absolute position embedding to content embedding.
+        pos_att_type (`List[str]`, *optional*):
+            The type of relative position attention, a combination of `["p2c", "c2p"]`, e.g. `["p2c"]` or
+            `["p2c", "c2p"]`. A `|`-separated string such as `"p2c|c2p"` is also accepted and split into a list.
+        pooler_dropout (`float`, *optional*, defaults to 0):
+            Dropout value stored as `pooler_dropout`; `pooler_hidden_act` (defaults to `"gelu"`) is stored alongside it.
+    """
+    model_type = 'deberta_v2'
+
+    def __init__(self,
+                 vocab_size=128100,
+                 hidden_size=1536,
+                 num_hidden_layers=24,
+                 num_attention_heads=24,
+                 intermediate_size=6144,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=0,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-7,
+                 relative_attention=False,
+                 max_relative_positions=-1,
+                 pad_token_id=0,
+                 position_biased_input=True,
+                 pos_att_type=None,
+                 pooler_dropout=0,
+                 pooler_hidden_act='gelu',
+                 **kwargs):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.relative_attention = relative_attention
+        self.max_relative_positions = max_relative_positions
+        self.pad_token_id = pad_token_id
+        self.position_biased_input = position_biased_input
+
+        # Backwards compatibility: accept a "|"-separated string (str subclasses included).
+        if isinstance(pos_att_type, str):
+            pos_att_type = [x.strip() for x in pos_att_type.lower().split('|')]
+
+        self.pos_att_type = pos_att_type
+        self.vocab_size = vocab_size
+        self.layer_norm_eps = layer_norm_eps
+
+        self.pooler_hidden_size = kwargs.get('pooler_hidden_size', hidden_size)
+        self.pooler_dropout = pooler_dropout
+        self.pooler_hidden_act = pooler_hidden_act
diff --git a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py b/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py
new file mode 100644
index 00000000..1c6b9071
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py
@@ -0,0 +1,1789 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020 Microsoft and the Hugging Face Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch DeBERTa-v2 model."""
+
+from collections.abc import Sequence
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
+from transformers.activations import ACT2FN
+from transformers.file_utils import (add_code_sample_docstrings,
+                                     add_start_docstrings,
+                                     add_start_docstrings_to_model_forward)
+from transformers.modeling_outputs import (BaseModelOutput, MaskedLMOutput,
+                                           MultipleChoiceModelOutput,
+                                           QuestionAnsweringModelOutput,
+                                           SequenceClassifierOutput,
+                                           TokenClassifierOutput)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import softmax_backward_data
+
+from modelscope.utils import logger as logging
+from .configuration_deberta_v2 import DebertaV2Config
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = 'DebertaV2Config'
+_TOKENIZER_FOR_DOC = 'DebertaV2Tokenizer'
+_CHECKPOINT_FOR_DOC = 'nlp_debertav2_fill-mask_chinese-lite'
+
+
+# Copied from transformers.models.deberta.modeling_deberta.ContextPooler
+class ContextPooler(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.pooler_hidden_size,
+                               config.pooler_hidden_size)
+        self.dropout = StableDropout(config.pooler_dropout)
+        self.config = config
+
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+
+        context_token = hidden_states[:, 0]
+        context_token = self.dropout(context_token)
+        pooled_output = self.dense(context_token)
+        pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
+        return pooled_output
+
+    @property
+    def output_dim(self):
+        return self.config.hidden_size
+
+
+# Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2
+class XSoftmax(torch.autograd.Function):
+    """
+    Masked Softmax which is optimized for saving memory
+
+    Args:
+        input (`torch.tensor`): The input tensor that will apply softmax.
+        mask (`torch.IntTensor`):
+            The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
+        dim (int): The dimension that will apply softmax
+
+    Example:
+
+    ```python
+    >>> import torch
+    >>> from modelscope.models.nlp.deberta_v2.modeling_deberta_v2 import XSoftmax
+
+    >>> # Make a tensor
+    >>> x = torch.randn([4, 20, 100])
+
+    >>> # Create a mask
+    >>> mask = (x > 0).int()
+
+    >>> # Specify the dimension to apply softmax
+    >>> dim = -1
+
+    >>> y = XSoftmax.apply(x, mask, dim)
+    ```"""
+
+    @staticmethod
+    def forward(self, input, mask, dim):
+        self.dim = dim
+        rmask = ~(mask.to(torch.bool))
+
+        output = input.masked_fill(rmask,
+                                   torch.tensor(torch.finfo(input.dtype).min))
+        output = torch.softmax(output, self.dim)
+        output.masked_fill_(rmask, 0)
+        self.save_for_backward(output)
+        return output
+
+    @staticmethod
+    def backward(self, grad_output):
+        (output, ) = self.saved_tensors
+        inputGrad = softmax_backward_data(self, grad_output, output, self.dim,
+                                          output)
+        return inputGrad, None, None
+
+    @staticmethod
+    def symbolic(g, self, mask, dim):
+        import torch.onnx.symbolic_helper as sym_help
+        from torch.onnx.symbolic_opset9 import masked_fill, softmax
+
+        mask_cast_value = g.op(
+            'Cast', mask, to_i=sym_help.cast_pytorch_to_onnx['Long'])
+        r_mask = g.op(
+            'Cast',
+            g.op('Sub',
+                 g.op('Constant', value_t=torch.tensor(1, dtype=torch.int64)),
+                 mask_cast_value),
+            to_i=sym_help.cast_pytorch_to_onnx['Byte'],
+        )
+        output = masked_fill(
+            g, self, r_mask,
+            g.op(
+                'Constant',
+                value_t=torch.tensor(torch.finfo(self.type().dtype()).min)))
+        output = softmax(g, output, dim)
+        return masked_fill(
+            g, output, r_mask,
+            g.op('Constant', value_t=torch.tensor(0, dtype=torch.uint8)))
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DropoutContext
+class DropoutContext(object):
+
+    def __init__(self):
+        self.dropout = 0
+        self.mask = None
+        self.scale = 1
+        self.reuse_mask = True
+
+
+# Copied from transformers.models.deberta.modeling_deberta.get_mask
+def get_mask(input, local_context):
+    if not isinstance(local_context, DropoutContext):
+        dropout = local_context
+        mask = None
+    else:
+        dropout = local_context.dropout
+        dropout *= local_context.scale
+        mask = local_context.mask if local_context.reuse_mask else None
+
+    if dropout > 0 and mask is None:
+        mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(
+            torch.bool)
+
+    if isinstance(local_context, DropoutContext):
+        if local_context.mask is None:
+            local_context.mask = mask
+
+    return mask, dropout
+
+
+# Copied from transformers.models.deberta.modeling_deberta.XDropout
+class XDropout(torch.autograd.Function):
+    """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
+
+    @staticmethod
+    def forward(ctx, input, local_ctx):
+        mask, dropout = get_mask(input, local_ctx)
+        ctx.scale = 1.0 / (1 - dropout)
+        if dropout > 0:
+            ctx.save_for_backward(mask)
+            return input.masked_fill(mask, 0) * ctx.scale
+        else:
+            return input
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        if ctx.scale > 1:
+            (mask, ) = ctx.saved_tensors
+            return grad_output.masked_fill(mask, 0) * ctx.scale, None
+        else:
+            return grad_output, None
+
+    @staticmethod
+    def symbolic(g: torch._C.Graph, input: torch._C.Value,
+                 local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
+        from torch.onnx import symbolic_opset12
+
+        dropout_p = local_ctx
+        if isinstance(local_ctx, DropoutContext):
+            dropout_p = local_ctx.dropout
+        # StableDropout only calls this function when training.
+        train = True
+        # TODO: We should check if the opset_version being used to export
+        # is >= 12 here, but there's no good way to do that. As-is, if the
+        # opset_version < 12, export will fail with a CheckerError.
+        # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like:
+        # if opset_version < 12:
+        #   return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train)
+        return symbolic_opset12.dropout(g, input, dropout_p, train)
+
+
+# Copied from transformers.models.deberta.modeling_deberta.StableDropout
+class StableDropout(nn.Module):
+    """
+    Optimized dropout module for stabilizing the training
+
+    Args:
+        drop_prob (float): the dropout probabilities
+    """
+
+    def __init__(self, drop_prob):
+        super().__init__()
+        self.drop_prob = drop_prob
+        self.count = 0
+        self.context_stack = None
+
+    def forward(self, x):
+        """
+        Call the module
+
+        Args:
+            x (`torch.tensor`): The input tensor to apply dropout
+        """
+        if self.training and self.drop_prob > 0:
+            return XDropout.apply(x, self.get_context())
+        return x
+
+    def clear_context(self):
+        self.count = 0
+        self.context_stack = None
+
+    def init_context(self, reuse_mask=True, scale=1):
+        if self.context_stack is None:
+            self.context_stack = []
+        self.count = 0
+        for c in self.context_stack:
+            c.reuse_mask = reuse_mask
+            c.scale = scale
+
+    def get_context(self):
+        if self.context_stack is not None:
+            if self.count >= len(self.context_stack):
+                self.context_stack.append(DropoutContext())
+            ctx = self.context_stack[self.count]
+            ctx.dropout = self.drop_prob
+            self.count += 1
+            return ctx
+        else:
+            return self.drop_prob
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm
+class DebertaV2SelfOutput(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+        self.dropout = StableDropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2
+class DebertaV2Attention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.self = DisentangledSelfAttention(config)
+        self.output = DebertaV2SelfOutput(config)
+        self.config = config
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        output_attentions=False,
+        query_states=None,
+        relative_pos=None,
+        rel_embeddings=None,
+    ):
+        self_output = self.self(
+            hidden_states,
+            attention_mask,
+            output_attentions,
+            query_states=query_states,
+            relative_pos=relative_pos,
+            rel_embeddings=rel_embeddings,
+        )
+        if output_attentions:
+            self_output, att_matrix = self_output
+        if query_states is None:
+            query_states = hidden_states
+        attention_output = self.output(self_output, query_states)
+
+        if output_attentions:
+            return (attention_output, att_matrix)
+        else:
+            return attention_output
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2
+class DebertaV2Intermediate(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm
+class DebertaV2Output(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+        self.dropout = StableDropout(config.hidden_dropout_prob)
+        self.config = config
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2
+class DebertaV2Layer(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.attention = DebertaV2Attention(config)
+        self.intermediate = DebertaV2Intermediate(config)
+        self.output = DebertaV2Output(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        query_states=None,
+        relative_pos=None,
+        rel_embeddings=None,
+        output_attentions=False,
+    ):
+        attention_output = self.attention(
+            hidden_states,
+            attention_mask,
+            output_attentions=output_attentions,
+            query_states=query_states,
+            relative_pos=relative_pos,
+            rel_embeddings=rel_embeddings,
+        )
+        if output_attentions:
+            attention_output, att_matrix = attention_output
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        if output_attentions:
+            return (layer_output, att_matrix)
+        else:
+            return layer_output
+
+
+class ConvLayer(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        kernel_size = getattr(config, 'conv_kernel_size', 3)
+        groups = getattr(config, 'conv_groups', 1)
+        self.conv_act = getattr(config, 'conv_act', 'tanh')
+        self.conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size,
+            padding=(kernel_size - 1) // 2,
+            groups=groups)
+        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+        self.dropout = StableDropout(config.hidden_dropout_prob)
+        self.config = config
+
+    def forward(self, hidden_states, residual_states, input_mask):
+        """Apply conv + activation, add `residual_states`, LayerNorm, then mask."""
+        out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(
+            0, 2, 1).contiguous()
+        if input_mask is not None:
+            # Guarded so a None mask reaches the pass-through return below.
+            rmask = (1 - input_mask).bool()
+            out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
+        out = ACT2FN[self.conv_act](self.dropout(out))
+
+        layer_norm_input = residual_states + out
+        output = self.LayerNorm(layer_norm_input).to(layer_norm_input)
+        if input_mask is None:
+            return output
+
+        if input_mask.dim() != layer_norm_input.dim():
+            if input_mask.dim() == 4:
+                input_mask = input_mask.squeeze(1).squeeze(1)
+            input_mask = input_mask.unsqueeze(2)
+
+        # Multiply by the mask so positions where it is 0 are zeroed in the result.
+        return output * input_mask.to(output.dtype)
+
+
+class DebertaV2Encoder(nn.Module):
+    """Modified BertEncoder with relative position bias support"""
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.layer = nn.ModuleList(
+            [DebertaV2Layer(config) for _ in range(config.num_hidden_layers)])
+        self.relative_attention = getattr(config, 'relative_attention', False)
+
+        if self.relative_attention:
+            self.max_relative_positions = getattr(config,
+                                                  'max_relative_positions', -1)
+            if self.max_relative_positions < 1:
+                self.max_relative_positions = config.max_position_embeddings
+
+            self.position_buckets = getattr(config, 'position_buckets', -1)
+            pos_ebd_size = self.max_relative_positions * 2
+
+            if self.position_buckets > 0:
+                pos_ebd_size = self.position_buckets * 2
+
+            self.rel_embeddings = nn.Embedding(pos_ebd_size,
+                                               config.hidden_size)
+
+        self.norm_rel_ebd = [
+            x.strip()
+            for x in getattr(config, 'norm_rel_ebd', 'none').lower().split('|')
+        ]
+
+        if 'layer_norm' in self.norm_rel_ebd:
+            self.LayerNorm = LayerNorm(
+                config.hidden_size,
+                config.layer_norm_eps,
+                elementwise_affine=True)
+
+        self.conv = ConvLayer(config) if getattr(config, 'conv_kernel_size',
+                                                 0) > 0 else None
+        self.gradient_checkpointing = False
+
+    def get_rel_embedding(self):
+        rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
+        if rel_embeddings is not None and ('layer_norm' in self.norm_rel_ebd):
+            rel_embeddings = self.LayerNorm(rel_embeddings)
+        return rel_embeddings
+
+    def get_attention_mask(self, attention_mask):
+        if attention_mask.dim() <= 2:
+            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+            attention_mask = extended_attention_mask * extended_attention_mask.squeeze(
+                -2).unsqueeze(-1)
+            attention_mask = attention_mask.byte()
+        elif attention_mask.dim() == 3:
+            attention_mask = attention_mask.unsqueeze(1)
+
+        return attention_mask
+
+    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
+        if self.relative_attention and relative_pos is None:
+            q = query_states.size(
+                -2) if query_states is not None else hidden_states.size(-2)
+            relative_pos = build_relative_position(
+                q,
+                hidden_states.size(-2),
+                bucket_size=self.position_buckets,
+                max_position=self.max_relative_positions)
+        return relative_pos
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        output_hidden_states=True,
+        output_attentions=False,
+        query_states=None,
+        relative_pos=None,
+        return_dict=True,
+    ):
+        """Run every module in ``self.layer`` over ``hidden_states``.
+
+        Args:
+            hidden_states: a tensor, or a ``Sequence`` whose first element
+                feeds the first layer.
+            attention_mask: mask handed to ``get_attention_mask`` before
+                being passed to the layers.
+            output_hidden_states: collect the input of every layer plus the
+                final output.
+            output_attentions: ask each layer for its attention matrix and
+                collect them.
+            query_states: optional separate query input; when given, each
+                layer's output becomes the query of the next layer.
+            relative_pos: optional precomputed relative positions; built via
+                ``get_rel_pos`` when omitted.
+            return_dict: return a ``BaseModelOutput`` instead of a tuple.
+
+        Returns:
+            ``BaseModelOutput`` or, when ``return_dict`` is false, a tuple of
+            the non-``None`` values among (last output, hidden states,
+            attentions).
+        """
+        # Keep a per-token mask for the optional conv module: a mask with at
+        # most 2 dims is used directly, otherwise it is reduced over dim -2.
+        if attention_mask.dim() <= 2:
+            input_mask = attention_mask
+        else:
+            input_mask = (attention_mask.sum(-2) > 0).byte()
+        attention_mask = self.get_attention_mask(attention_mask)
+        relative_pos = self.get_rel_pos(hidden_states, query_states,
+                                        relative_pos)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # ``next_kv`` is what the upcoming layer receives as its first input.
+        if isinstance(hidden_states, Sequence):
+            next_kv = hidden_states[0]
+        else:
+            next_kv = hidden_states
+        rel_embeddings = self.get_rel_embedding()
+        output_states = next_kv
+        for i, layer_module in enumerate(self.layer):
+
+            # Recorded before the layer runs, so entry ``i`` is the input of
+            # layer ``i``; the final output is appended after the loop.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (output_states, )
+
+            if self.gradient_checkpointing and self.training:
+
+                # The wrapper appends ``output_attentions`` as the last
+                # positional argument of the layer call.
+                # NOTE(review): relies on the layer's positional parameter
+                # order, which is defined outside this method -- confirm.
+                def create_custom_forward(module):
+
+                    def custom_forward(*inputs):
+                        return module(*inputs, output_attentions)
+
+                    return custom_forward
+
+                output_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    next_kv,
+                    attention_mask,
+                    query_states,
+                    relative_pos,
+                    rel_embeddings,
+                )
+            else:
+                output_states = layer_module(
+                    next_kv,
+                    attention_mask,
+                    query_states=query_states,
+                    relative_pos=relative_pos,
+                    rel_embeddings=rel_embeddings,
+                    output_attentions=output_attentions,
+                )
+
+            # With attentions requested the layer result is unpacked as an
+            # (output, attention) pair.
+            if output_attentions:
+                output_states, att_m = output_states
+
+            # The optional conv module is applied once, to the output of the
+            # first layer, together with the original ``hidden_states``.
+            if i == 0 and self.conv is not None:
+                output_states = self.conv(hidden_states, output_states,
+                                          input_mask)
+
+            if query_states is not None:
+                # Separate-query mode: the output becomes the next query; the
+                # key/value input only advances when ``hidden_states`` is a
+                # sequence (``None`` once the last layer has run).
+                query_states = output_states
+                if isinstance(hidden_states, Sequence):
+                    next_kv = hidden_states[i + 1] if i + 1 < len(
+                        self.layer) else None
+            else:
+                next_kv = output_states
+
+            if output_attentions:
+                all_attentions = all_attentions + (att_m, )
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (output_states, )
+
+        if not return_dict:
+            return tuple(
+                v for v in [output_states, all_hidden_states, all_attentions]
+                if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=output_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions)
+
+
+def make_log_bucket_position(relative_pos, bucket_size, max_position):
+    """Map relative positions onto log-spaced buckets.
+
+    Offsets whose magnitude is at most ``bucket_size // 2`` are returned
+    unchanged (as floats); larger offsets are compressed logarithmically
+    and keep their sign.
+
+    Args:
+        relative_pos: tensor of signed relative positions.
+        bucket_size (int): bucket count; half of it is the linear range.
+        max_position (int): position used to scale the log compression.
+
+    Return:
+        Tensor of bucketed positions with the dtype produced by the log
+        computation.
+    """
+    half = bucket_size // 2
+    direction = torch.sign(relative_pos)
+
+    # Inside the open interval (-half, half) the magnitude is replaced by a
+    # placeholder so the logarithm below never sees zero.
+    inside_linear = (relative_pos < half) & (relative_pos > -half)
+    placeholder = torch.tensor(half - 1).type_as(relative_pos)
+    magnitude = torch.where(inside_linear, placeholder,
+                            torch.abs(relative_pos))
+
+    log_range = torch.log(torch.tensor((max_position - 1) / half))
+    log_bucket = torch.ceil(
+        torch.log(magnitude / half) / log_range * (half - 1)) + half
+
+    return torch.where(magnitude <= half, relative_pos.type_as(log_bucket),
+                       log_bucket * direction)
+
+
+def build_relative_position(query_size,
+                            key_size,
+                            bucket_size=-1,
+                            max_position=-1):
+    """
+    Build the query-to-key relative position matrix.
+
+    Query positions run over ``range(query_size)`` and key positions over
+    ``range(key_size)``; entry ``[q, k]`` is ``q - k``. When both
+    ``bucket_size`` and ``max_position`` are positive the offsets are passed
+    through ``make_log_bucket_position`` first.
+
+    Args:
+        query_size (int): the length of query
+        key_size (int): the length of key
+        bucket_size (int): the size of position bucket
+        max_position (int): the maximum allowed absolute position
+
+    Return:
+        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
+
+    """
+    query_index = torch.arange(0, query_size).unsqueeze(1)
+    key_index = torch.arange(0, key_size).unsqueeze(0)
+    offsets = query_index - key_index
+
+    use_buckets = bucket_size > 0 and max_position > 0
+    if use_buckets:
+        offsets = make_log_bucket_position(offsets, bucket_size,
+                                           max_position)
+
+    offsets = offsets.to(torch.long)[:query_size, :]
+    return offsets.unsqueeze(0)
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand
+def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
+    # Expand ``c2p_pos`` to a 4-D shape: the first three sizes come from
+    # ``query_layer`` and the last one is ``relative_pos.size(-1)``.
+    # Compiled by TorchScript, so the body must stay within its subset.
+    return c2p_pos.expand([
+        query_layer.size(0),
+        query_layer.size(1),
+        query_layer.size(2),
+        relative_pos.size(-1)
+    ])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand
+def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
+    # Expand ``c2p_pos`` to a 4-D shape: the first two sizes come from
+    # ``query_layer`` and the last two are both ``key_layer.size(-2)``.
+    # Compiled by TorchScript, so the body must stay within its subset.
+    return c2p_pos.expand([
+        query_layer.size(0),
+        query_layer.size(1),
+        key_layer.size(-2),
+        key_layer.size(-2)
+    ])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand
+def pos_dynamic_expand(pos_index, p2c_att, key_layer):
+    # Expand ``pos_index`` to: the first two sizes of ``p2c_att``, followed by
+    # ``pos_index.size(-2)`` and ``key_layer.size(-2)``.
+    # Compiled by TorchScript, so the body must stay within its subset.
+    return pos_index.expand(p2c_att.size()[:2]
+                            + (pos_index.size(-2), key_layer.size(-2)))
+
+
+class DisentangledSelfAttention(nn.Module):
+    """
+    Disentangled self-attention module
+
+    Parameters:
+        config (`DebertaV2Config`):
+            A model config class instance with the configuration to build a new model. The schema is similar to
+            *BertConfig*, for more details, please refer [`DebertaV2Config`]
+
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention '
+                f'heads ({config.num_attention_heads})')
+        self.num_attention_heads = config.num_attention_heads
+        _attention_head_size = config.hidden_size // config.num_attention_heads
+        self.attention_head_size = getattr(config, 'attention_head_size',
+                                           _attention_head_size)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.query_proj = nn.Linear(
+            config.hidden_size, self.all_head_size, bias=True)
+        self.key_proj = nn.Linear(
+            config.hidden_size, self.all_head_size, bias=True)
+        self.value_proj = nn.Linear(
+            config.hidden_size, self.all_head_size, bias=True)
+
+        self.share_att_key = getattr(config, 'share_att_key', False)
+        self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
+        self.relative_attention = getattr(config, 'relative_attention', False)
+
+        if self.relative_attention:
+            self.position_buckets = getattr(config, 'position_buckets', -1)
+            self.max_relative_positions = getattr(config,
+                                                  'max_relative_positions', -1)
+            if self.max_relative_positions < 1:
+                self.max_relative_positions = config.max_position_embeddings
+            # Half-width of the relative-embedding table that is actually
+            # used: the bucket count when bucketing is on, otherwise the
+            # maximum relative distance.
+            self.pos_ebd_size = self.max_relative_positions
+            if self.position_buckets > 0:
+                self.pos_ebd_size = self.position_buckets
+
+            self.pos_dropout = StableDropout(config.hidden_dropout_prob)
+
+            # Dedicated position projections are only created when the
+            # content projections are not shared with the positions.
+            if not self.share_att_key:
+                if 'c2p' in self.pos_att_type:
+                    self.pos_key_proj = nn.Linear(
+                        config.hidden_size, self.all_head_size, bias=True)
+                if 'p2c' in self.pos_att_type:
+                    self.pos_query_proj = nn.Linear(config.hidden_size,
+                                                    self.all_head_size)
+
+        self.dropout = StableDropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x, attention_heads):
+        """Split the last dim of ``x`` into heads and fold them into dim 0.
+
+        The last dimension is viewed as ``(attention_heads, -1)``, the head
+        axis is moved in front of dim 1 and then merged with dim 0, so a
+        ``(B, N, H * D)`` input comes back as ``(B * H, N, D)``.
+        """
+        new_x_shape = x.size()[:-1] + (attention_heads, -1)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1),
+                                                       x.size(-1))
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        output_attentions=False,
+        query_states=None,
+        relative_pos=None,
+        rel_embeddings=None,
+    ):
+        """
+        Call the module
+
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input states to the module usually the output from previous layer, it will be the Q,K and V in
+                *Attention(Q,K,V)*
+
+            attention_mask (`torch.ByteTensor`):
+                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
+                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
+                th token.
+
+            output_attentions (`bool`, optional):
+                Whether return the attention matrix.
+
+            query_states (`torch.FloatTensor`, optional):
+                The *Q* state in *Attention(Q,K,V)*.
+
+            relative_pos (`torch.LongTensor`):
+                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+                values ranging in [*-max_relative_positions*, *max_relative_positions*].
+
+            rel_embeddings (`torch.FloatTensor`):
+                The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+                \\text{max_relative_positions}\\), *hidden_size*].
+
+        Returns:
+            The context tensor, or a ``(context, attention_probs)`` pair when
+            ``output_attentions`` is true.
+
+        """
+        if query_states is None:
+            query_states = hidden_states
+        query_layer = self.transpose_for_scores(
+            self.query_proj(query_states), self.num_attention_heads)
+        key_layer = self.transpose_for_scores(
+            self.key_proj(hidden_states), self.num_attention_heads)
+        value_layer = self.transpose_for_scores(
+            self.value_proj(hidden_states), self.num_attention_heads)
+
+        rel_att = None
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # One score term for content->content plus one per enabled
+        # positional term; the scale grows accordingly.
+        scale_factor = 1
+        if 'c2p' in self.pos_att_type:
+            scale_factor += 1
+        if 'p2c' in self.pos_att_type:
+            scale_factor += 1
+        scale = torch.sqrt(
+            torch.tensor(query_layer.size(-1), dtype=torch.float)
+            * scale_factor)
+        # ``scale`` is already a tensor: cast it with ``.to`` rather than
+        # re-wrapping it in ``torch.tensor``, which warns on every call.
+        attention_scores = torch.bmm(query_layer, key_layer.transpose(
+            -1, -2)) / scale.to(dtype=query_layer.dtype)
+        if self.relative_attention:
+            rel_embeddings = self.pos_dropout(rel_embeddings)
+            rel_att = self.disentangled_attention_bias(query_layer, key_layer,
+                                                       relative_pos,
+                                                       rel_embeddings,
+                                                       scale_factor)
+
+        if rel_att is not None:
+            attention_scores = attention_scores + rel_att
+        # Unfold the head axis again: (B * H, Nq, Nk) -> (B, H, Nq, Nk).
+        attention_scores = attention_scores.view(-1, self.num_attention_heads,
+                                                 attention_scores.size(-2),
+                                                 attention_scores.size(-1))
+
+        # bsz x height x length x dimension
+        attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
+        attention_probs = self.dropout(attention_probs)
+        context_layer = torch.bmm(
+            attention_probs.view(-1, attention_probs.size(-2),
+                                 attention_probs.size(-1)), value_layer)
+        context_layer = (
+            context_layer.view(-1, self.num_attention_heads,
+                               context_layer.size(-2),
+                               context_layer.size(-1)).permute(0, 2, 1,
+                                                               3).contiguous())
+        # Merge the head and per-head dims back into one feature dim.
+        new_context_layer_shape = context_layer.size()[:-2] + (-1, )
+        context_layer = context_layer.view(new_context_layer_shape)
+        if output_attentions:
+            return (context_layer, attention_probs)
+        else:
+            return context_layer
+
+    def disentangled_attention_bias(self, query_layer, key_layer, relative_pos,
+                                    rel_embeddings, scale_factor):
+        """Compute the positional part of the attention scores.
+
+        Args:
+            query_layer: per-head queries as produced by
+                ``transpose_for_scores``.
+            key_layer: per-head keys in the same layout.
+            relative_pos: relative position ids with 2, 3 or 4 dims, or
+                ``None`` to build them from the query/key lengths.
+            rel_embeddings: relative position embedding table; only its
+                first ``2 * self.pos_ebd_size`` rows are used.
+            scale_factor: number of score terms, used to scale each term.
+
+        Returns:
+            The sum of the enabled ``c2p`` / ``p2c`` terms, or ``0`` when
+            neither is listed in ``self.pos_att_type``.
+
+        Raises:
+            ValueError: if ``relative_pos`` has an unsupported rank.
+        """
+        if relative_pos is None:
+            q = query_layer.size(-2)
+            relative_pos = build_relative_position(
+                q,
+                key_layer.size(-2),
+                bucket_size=self.position_buckets,
+                max_position=self.max_relative_positions)
+        if relative_pos.dim() == 2:
+            relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
+        elif relative_pos.dim() == 3:
+            relative_pos = relative_pos.unsqueeze(1)
+        # bsz x height x query x key
+        elif relative_pos.dim() != 4:
+            raise ValueError(
+                f'Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}'
+            )
+
+        att_span = self.pos_ebd_size
+        relative_pos = relative_pos.long().to(query_layer.device)
+
+        rel_embeddings = rel_embeddings[0:att_span * 2, :].unsqueeze(0)
+        # Project the position table into per-head form and tile it so its
+        # leading dim matches ``query_layer`` (batch * heads).
+        if self.share_att_key:
+            pos_query_layer = self.transpose_for_scores(
+                self.query_proj(rel_embeddings),
+                self.num_attention_heads).repeat(
+                    query_layer.size(0) // self.num_attention_heads, 1, 1)
+            pos_key_layer = self.transpose_for_scores(
+                self.key_proj(rel_embeddings),
+                self.num_attention_heads).repeat(
+                    query_layer.size(0) // self.num_attention_heads, 1, 1)
+        else:
+            if 'c2p' in self.pos_att_type:
+                pos_key_layer = self.transpose_for_scores(
+                    self.pos_key_proj(rel_embeddings),
+                    self.num_attention_heads).repeat(
+                        query_layer.size(0) // self.num_attention_heads, 1,
+                        1)
+            if 'p2c' in self.pos_att_type:
+                pos_query_layer = self.transpose_for_scores(
+                    self.pos_query_proj(rel_embeddings),
+                    self.num_attention_heads).repeat(
+                        query_layer.size(0) // self.num_attention_heads, 1,
+                        1)
+
+        score = 0
+        # content->position
+        if 'c2p' in self.pos_att_type:
+            scale = torch.sqrt(
+                torch.tensor(pos_key_layer.size(-1), dtype=torch.float)
+                * scale_factor)
+            c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2))
+            # Shift offsets into [0, 2 * att_span) so they index the table.
+            c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
+            c2p_att = torch.gather(
+                c2p_att,
+                dim=-1,
+                index=c2p_pos.squeeze(0).expand([
+                    query_layer.size(0),
+                    query_layer.size(1),
+                    relative_pos.size(-1)
+                ]),
+            )
+            # Cast the existing tensor instead of copy-constructing it.
+            score += c2p_att / scale.to(dtype=c2p_att.dtype)
+
+        # position->content
+        if 'p2c' in self.pos_att_type:
+            scale = torch.sqrt(
+                torch.tensor(pos_query_layer.size(-1), dtype=torch.float)
+                * scale_factor)
+            # With differing query/key lengths the key-to-key positions are
+            # rebuilt; otherwise the given positions are reused.
+            if key_layer.size(-2) != query_layer.size(-2):
+                r_pos = build_relative_position(
+                    key_layer.size(-2),
+                    key_layer.size(-2),
+                    bucket_size=self.position_buckets,
+                    max_position=self.max_relative_positions,
+                ).to(query_layer.device)
+                r_pos = r_pos.unsqueeze(0)
+            else:
+                r_pos = relative_pos
+
+            p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1)
+            p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2))
+            p2c_att = torch.gather(
+                p2c_att,
+                dim=-1,
+                index=p2c_pos.squeeze(0).expand([
+                    query_layer.size(0),
+                    key_layer.size(-2),
+                    key_layer.size(-2)
+                ]),
+            ).transpose(-1, -2)
+            # Cast the existing tensor instead of copy-constructing it.
+            score += p2c_att / scale.to(dtype=p2c_att.dtype)
+
+        return score
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm
+class DebertaV2Embeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        pad_token_id = getattr(config, 'pad_token_id', 0)
+        self.embedding_size = getattr(config, 'embedding_size',
+                                      config.hidden_size)
+        self.word_embeddings = nn.Embedding(
+            config.vocab_size, self.embedding_size, padding_idx=pad_token_id)
+
+        # Absolute position embeddings exist only when the config asks for
+        # position-biased input (the default).
+        self.position_biased_input = getattr(config, 'position_biased_input',
+                                             True)
+        if not self.position_biased_input:
+            self.position_embeddings = None
+        else:
+            self.position_embeddings = nn.Embedding(
+                config.max_position_embeddings, self.embedding_size)
+
+        if config.type_vocab_size > 0:
+            self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                      self.embedding_size)
+
+        # Project up to the hidden size when embeddings are narrower/wider.
+        if self.embedding_size != config.hidden_size:
+            self.embed_proj = nn.Linear(
+                self.embedding_size, config.hidden_size, bias=False)
+        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+        self.dropout = StableDropout(config.hidden_dropout_prob)
+        self.config = config
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            'position_ids',
+            torch.arange(config.max_position_embeddings).expand((1, -1)))
+
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                mask=None,
+                inputs_embeds=None):
+        """Embed the inputs and apply projection, LayerNorm, mask and dropout.
+
+        Args:
+            input_ids: token ids; used when ``inputs_embeds`` is not given.
+            token_type_ids: segment ids; default to zeros.
+            position_ids: position ids; default to the leading slice of the
+                registered ``position_ids`` buffer.
+            mask: optional mask multiplied into the normalized embeddings.
+            inputs_embeds: precomputed input embeddings. The tensor is not
+                modified; all additions are done out of place.
+
+        Returns:
+            The embedding tensor after dropout.
+        """
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        # Out-of-place sums: ``inputs_embeds`` may be a caller-owned tensor,
+        # and ``+=`` would silently overwrite it.
+        embeddings = inputs_embeds
+        if self.position_biased_input and self.position_embeddings is not None:
+            embeddings = embeddings + self.position_embeddings(
+                position_ids.long())
+        if self.config.type_vocab_size > 0:
+            embeddings = embeddings + self.token_type_embeddings(
+                token_type_ids)
+
+        if self.embedding_size != self.config.hidden_size:
+            embeddings = self.embed_proj(embeddings)
+
+        embeddings = self.LayerNorm(embeddings)
+
+        if mask is not None:
+            # Bring the mask to (batch, seq, 1) so it broadcasts over the
+            # feature dimension.
+            if mask.dim() != embeddings.dim():
+                if mask.dim() == 4:
+                    mask = mask.squeeze(1).squeeze(1)
+                mask = mask.unsqueeze(2)
+            mask = mask.to(embeddings.dtype)
+
+            embeddings = embeddings * mask
+
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2
+class DebertaV2PreTrainedModel(PreTrainedModel):
+    """
+    Common base of the DeBERTa-v2 models: declares the config class and
+    checkpoint-loading hints, and provides weight initialization and the
+    gradient-checkpointing switch.
+    """
+
+    config_class = DebertaV2Config
+    base_model_prefix = 'deberta'
+    _keys_to_ignore_on_load_missing = ['position_ids']
+    _keys_to_ignore_on_load_unexpected = ['position_embeddings']
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights of ``nn.Linear`` / ``nn.Embedding`` modules."""
+        if not isinstance(module, (nn.Linear, nn.Embedding)):
+            return
+
+        # Plain normal init; the TF version uses truncated_normal instead
+        # (cf https://github.com/pytorch/pytorch/pull/5617).
+        module.weight.data.normal_(
+            mean=0.0, std=self.config.initializer_range)
+
+        if isinstance(module, nn.Linear):
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif module.padding_idx is not None:
+            # Keep the padding row at zero.
+            module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Set ``gradient_checkpointing`` on encoder modules only."""
+        if not isinstance(module, DebertaV2Encoder):
+            return
+        module.gradient_checkpointing = value
+
+
+# Shared class-level docstring preamble; it is passed to
+# ``add_start_docstrings`` when decorating the model classes.
+DEBERTA_START_DOCSTRING = r"""
+    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
+    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
+    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
+    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+
+    Parameters:
+        config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+# Argument documentation for the ``forward`` methods. The ``{0}`` placeholders
+# are filled with the input shape text via ``str.format`` before the result is
+# handed to ``add_start_docstrings_to_model_forward``.
+DEBERTA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    'The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.',
+    DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2
+class DebertaV2Model(DebertaV2PreTrainedModel):
+    # Embedding layer followed by the encoder stack; no task head.
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embeddings = DebertaV2Embeddings(config)
+        self.encoder = DebertaV2Encoder(config)
+        # Number of extra passes through the last encoder layer made in
+        # ``forward``; 0 disables that path.
+        self.z_steps = 0
+        self.config = config
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the word embedding module."""
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, new_embeddings):
+        """Replace the word embedding module."""
+        self.embeddings.word_embeddings = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        raise NotImplementedError(
+            'The prune function is not implemented in DeBERTa model.')
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        # Flags left as ``None`` fall back to the values stored in the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Exactly one of ``input_ids`` / ``inputs_embeds`` must be given.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                'You cannot specify both input_ids and inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError(
+                'You have to specify either input_ids or inputs_embeds')
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # Defaults: attend to every position, single segment.
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=device)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+        )
+
+        # Hidden states are always requested from the encoder because the
+        # final output is read from them below; whether they are returned to
+        # the caller is decided at the end.
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask,
+            output_hidden_states=True,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+        # Per-layer hidden states (a tuple).
+        encoded_layers = encoder_outputs[1]
+
+        if self.z_steps > 1:
+            # Re-apply the last encoder layer, using the previous result as
+            # the query and the second-to-last hidden state as key/value.
+            hidden_states = encoded_layers[-2]
+            layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
+            query_states = encoded_layers[-1]
+            rel_embeddings = self.encoder.get_rel_embedding()
+            attention_mask = self.encoder.get_attention_mask(attention_mask)
+            rel_pos = self.encoder.get_rel_pos(embedding_output)
+            for layer in layers[1:]:
+                query_states = layer(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions=False,
+                    query_states=query_states,
+                    relative_pos=rel_pos,
+                    rel_embeddings=rel_embeddings,
+                )
+                # ``encoded_layers`` is a tuple and has no ``append``;
+                # rebuild it with the new state at the end.
+                encoded_layers = (*encoded_layers, query_states)
+
+        sequence_output = encoded_layers[-1]
+
+        if not return_dict:
+            # Drop the hidden-states entry unless the caller asked for it.
+            return (sequence_output, ) + encoder_outputs[
+                (1 if output_hidden_states else 2):]
+
+        return BaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states
+            if output_hidden_states else None,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """DeBERTa Model with a `language modeling` head on top.""",
+    DEBERTA_START_DOCSTRING)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2
+class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
+    # Checkpoint keys that may be absent or superfluous when loading.
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        # Backbone first, then the masked-LM head on top of it.
+        self.deberta = DebertaV2Model(config)
+        self.cls = DebertaV2OnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        """Return the decoder module of the prediction head."""
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the decoder module of the prediction head."""
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        backbone_out = self.deberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Element 0 of the backbone output is the final hidden state.
+        logits = self.cls(backbone_out[0])
+
+        loss = None
+        if labels is not None:
+            # -100 index = padding token
+            criterion = CrossEntropyLoss()
+            flat_logits = logits.view(-1, self.config.vocab_size)
+            loss = criterion(flat_logits, labels.view(-1))
+
+        if return_dict:
+            return MaskedLMOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=backbone_out.hidden_states,
+                attentions=backbone_out.attentions,
+            )
+
+        # Tuple form: logits followed by the backbone extras, with the loss
+        # prepended when it was computed.
+        tuple_out = (logits, ) + backbone_out[1:]
+        if loss is None:
+            return tuple_out
+        return (loss, ) + tuple_out
+
+
+# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta
+class DebertaV2PredictionHeadTransform(nn.Module):
+    """Dense -> activation -> LayerNorm transform that keeps the hidden size."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        # `hidden_act` is either a key of ACT2FN or is used directly as the activation.
+        act = config.hidden_act
+        self.transform_act_fn = ACT2FN[act] if isinstance(act, str) else act
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        """Apply the three stages in order and return the result."""
+        projected = self.dense(hidden_states)
+        activated = self.transform_act_fn(projected)
+        return self.LayerNorm(activated)
+
+
+# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta
+class DebertaV2LMPredictionHead(nn.Module):
+    """Head transform followed by a linear decoder over the vocabulary."""
+
+    def __init__(self, config):
+        super().__init__()
+        vocab_size = config.vocab_size
+        self.transform = DebertaV2PredictionHeadTransform(config)
+
+        # The decoder is built without its own bias; a separate per-token
+        # bias parameter is created and attached to it below.
+        self.decoder = nn.Linear(config.hidden_size, vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(vocab_size))
+
+        # Share the parameter object so that `decoder.bias` and `bias` stay
+        # linked, e.g. when `resize_token_embeddings` resizes the bias.
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        """Return vocabulary scores for `hidden_states`."""
+        return self.decoder(self.transform(hidden_states))
+
+
+# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta
+class DebertaV2OnlyMLMHead(nn.Module):
+    """Thin wrapper exposing the LM prediction head as `predictions`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = DebertaV2LMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        return self.predictions(sequence_output)
+
+
+@add_start_docstrings(
+    """
+    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2
+class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        num_labels = getattr(config, 'num_labels', 2)  # 2 when the config has no `num_labels`
+        self.num_labels = num_labels
+
+        self.deberta = DebertaV2Model(config)
+        self.pooler = ContextPooler(config)
+        output_dim = self.pooler.output_dim
+
+        self.classifier = nn.Linear(output_dim, num_labels)
+        drop_out = getattr(config, 'cls_dropout', None)  # None -> fall back to hidden_dropout_prob
+        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
+        self.dropout = StableDropout(drop_out)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.deberta.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        self.deberta.set_input_embeddings(new_embeddings)
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.deberta(
+            input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        encoder_layer = outputs[0]
+        pooled_output = self.pooler(encoder_layer)
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # unset: choose the loss from num_labels / label shape
+                if self.num_labels == 1:
+                    # regression task: mean-squared error on flattened logits and labels
+                    loss_fn = nn.MSELoss()
+                    logits = logits.view(-1).to(labels.dtype)
+                    loss = loss_fn(logits, labels.view(-1))
+                elif labels.dim() == 1 or labels.size(-1) == 1:  # labels used as class indices
+                    label_index = (labels >= 0).nonzero()  # positions of non-negative labels
+                    labels = labels.long()
+                    if label_index.size(0) > 0:
+                        labeled_logits = torch.gather(
+                            logits, 0,
+                            label_index.expand(
+                                label_index.size(0), logits.size(1)))
+                        labels = torch.gather(labels, 0, label_index.view(-1))
+                        loss_fct = CrossEntropyLoss()
+                        loss = loss_fct(
+                            labeled_logits.view(-1, self.num_labels).float(),
+                            labels.view(-1))
+                    else:
+                        loss = torch.tensor(0).to(logits)  # no non-negative labels: zero loss
+                else:  # negative mean over rows of sum(labels * log_softmax(logits))
+                    log_softmax = nn.LogSoftmax(-1)
+                    loss = -((log_softmax(logits) * labels).sum(-1)).mean()
+            elif self.config.problem_type == 'regression':
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == 'single_label_classification':
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == 'multi_label_classification':
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:  # tuple output: optional loss, then logits, then the extra outputs
+            output = (logits, ) + outputs[1:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions)
+
+
+@add_start_docstrings(
+    """
+    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2
+class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        # Base model, dropout and the per-token linear classifier.
+        self.deberta = DebertaV2Model(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Per-token class indices used for the token classification loss, in `[0, ..., config.num_labels - 1]`.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.deberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Per-token logits: dropout on the first output element, then the linear classifier.
+        logits = self.classifier(self.dropout(outputs[0]))
+
+        loss = None
+        if labels is not None:
+            # Flatten to one row per token before the cross-entropy.
+            flat_logits = logits.view(-1, self.num_labels)
+            loss = CrossEntropyLoss()(flat_logits, labels.view(-1))
+
+        if return_dict:
+            return TokenClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions)
+
+        output = (logits, ) + outputs[1:]
+        return output if loss is None else (loss, ) + output
+
+
+@add_start_docstrings(
+    """
+    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2
+class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        # Base model plus one linear layer whose outputs are split into start/end logits.
+        self.deberta = DebertaV2Model(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Index of the first token of the labelled span, used as the target of the start-logit loss.
+            Positions are clamped to the length of the sequence (`sequence_length`); positions outside of the
+            sequence do not contribute to the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Index of the last token of the labelled span, used as the target of the end-logit loss.
+            Positions are clamped to the length of the sequence (`sequence_length`); positions outside of the
+            sequence do not contribute to the loss.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.deberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Project the first output element, then separate the start and end channels.
+        logits = self.qa_outputs(outputs[0])
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # Squeeze the last dimension when the positions have more than one dimension.
+            if start_positions.dim() > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.dim() > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Clamp positions into [0, seq_len]; seq_len itself is the ignored target.
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if return_dict:
+            return QuestionAnsweringModelOutput(
+                loss=total_loss,
+                start_logits=start_logits,
+                end_logits=end_logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        # Tuple output: optional loss, start logits, end logits, then the extra outputs.
+        output = (start_logits, end_logits) + outputs[1:]
+        return output if total_loss is None else (total_loss, ) + output
+
+
+@add_start_docstrings(
+    """
+    DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    DEBERTA_START_DOCSTRING,
+)
+class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.num_labels = getattr(config, 'num_labels', 2)
+
+        self.deberta = DebertaV2Model(config)
+        self.pooler = ContextPooler(config)
+        # One score per flattened row, computed from the pooled output.
+        self.classifier = nn.Linear(self.pooler.output_dim, 1)
+
+        # `cls_dropout` takes precedence; otherwise reuse `hidden_dropout_prob`.
+        drop_out = getattr(config, 'cls_dropout', None)
+        if drop_out is None:
+            drop_out = self.config.hidden_dropout_prob
+        self.dropout = StableDropout(drop_out)
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.deberta.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        self.deberta.set_input_embeddings(new_embeddings)
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Index of the correct choice for each example, used for the multiple choice classification loss.
+            Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second
+            dimension of the input tensors. (See `input_ids` above)
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        choice_source = input_ids if input_ids is not None else inputs_embeds
+        num_choices = choice_source.shape[1]
+
+        def _merge_leading(tensor):
+            # Collapse all leading dimensions into one, keeping the last dimension.
+            if tensor is None:
+                return None
+            return tensor.view(-1, tensor.size(-1))
+
+        flat_inputs_embeds = None
+        if inputs_embeds is not None:
+            # Embeddings keep their last two dimensions.
+            flat_inputs_embeds = inputs_embeds.view(
+                -1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+
+        outputs = self.deberta(
+            _merge_leading(input_ids),
+            position_ids=_merge_leading(position_ids),
+            token_type_ids=_merge_leading(token_type_ids),
+            attention_mask=_merge_leading(attention_mask),
+            inputs_embeds=flat_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Pool the first output element, apply dropout and score each row.
+        pooled_output = self.pooler(outputs[0])
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        # Regroup the scores into rows of `num_choices` columns.
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss = CrossEntropyLoss()(reshaped_logits, labels)
+
+        if return_dict:
+            return MultipleChoiceModelOutput(
+                loss=loss,
+                logits=reshaped_logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        # Tuple output: optional loss, then the logits, then the extra outputs.
+        output = (reshaped_logits, ) + outputs[1:]
+        return output if loss is None else (loss, ) + output
diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py
new file mode 100644
index 00000000..adb60288
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py
@@ -0,0 +1,546 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020 Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for DeBERTa. mainly copied from :module:`~transformers.tokenization_deberta`"""
+
+import os
+import unicodedata
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as sp
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}  # exposed as DebertaV2Tokenizer.pretrained_vocab_files_map
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}  # empty; exposed as DebertaV2Tokenizer.max_model_input_sizes
+
+PRETRAINED_INIT_CONFIGURATION = {}  # empty; exposed as DebertaV2Tokenizer.pretrained_init_configuration
+
+VOCAB_FILES_NAMES = {'vocab_file': 'spm.model'}  # exposed as DebertaV2Tokenizer.vocab_files_names
+
+
+class DebertaV2Tokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece)
+    and [jieba](https://github.com/fxsjy/jieba).
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        bos_token (`string`, *optional*, defaults to `"[CLS]"`):
+            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+        eos_token (`string`, *optional*, defaults to `"[SEP]"`):
+            The end of sequence token. When building a sequence using special tokens, this is not the token that is
+            used for the end of sequence. The token used is the `sep_token`.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self,
+                 vocab_file,
+                 do_lower_case=False,
+                 split_by_punct=False,
+                 split_chinese=True,
+                 bos_token='[CLS]',
+                 eos_token='[SEP]',
+                 unk_token='[UNK]',
+                 sep_token='[SEP]',
+                 pad_token='[PAD]',
+                 cls_token='[CLS]',
+                 mask_token='[MASK]',
+                 sp_model_kwargs: Optional[Dict[str, Any]] = None,
+                 **kwargs) -> None:
+        """Build the tokenizer from a local SentencePiece file; jieba is imported only if `split_chinese`."""
+        self.sp_model_kwargs = sp_model_kwargs if sp_model_kwargs is not None else {}
+        super().__init__(
+            do_lower_case=do_lower_case,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            split_by_punct=split_by_punct,
+            split_chinese=split_chinese,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                ' model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
+            )
+        self.do_lower_case = do_lower_case
+        self.split_by_punct = split_by_punct
+        self.split_chinese = split_chinese
+        self.vocab_file = vocab_file
+        self._tokenizer = SPMTokenizer(
+            vocab_file, split_by_punct=split_by_punct,
+            sp_model_kwargs=self.sp_model_kwargs)
+        self.jieba = None
+        if not split_chinese:
+            return
+        try:
+            import jieba
+        except ImportError:
+            raise ImportError(
+                'You need to install jieba to split chinese and use DebertaV2Tokenizer. '
+                'See https://pypi.org/project/jieba/ for installation.')
+        self.jieba = jieba
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)  # size of `self.vocab`; what `get_added_vocab()` returns is not counted here
+
+    @property
+    def vocab(self):
+        return self._tokenizer.vocab  # vocabulary held by the wrapped SPMTokenizer
+
+    def get_vocab(self):
+        vocab = self.vocab.copy()  # copy so the update below does not modify `self.vocab`
+        vocab.update(self.get_added_vocab())  # merge in whatever `get_added_vocab()` reports
+        return vocab
+
+    def _tokenize(self, text: str) -> List[str]:
+        """Lower-case and jieba-segment `text` as configured, then return the SPMTokenizer tokens."""
+        normalized = text.lower() if self.do_lower_case else text
+        if self.split_chinese:
+            # jieba yields the word segments; join them with single spaces.
+            segments = self.jieba.cut(normalized)
+            normalized = ' '.join(segments)
+        return self._tokenizer.tokenize(normalized)
+
+    def _convert_token_to_id(self, token):
+        """Convert a token (str) to its id via `self._tokenizer.spm.PieceToId`."""
+        return self._tokenizer.spm.PieceToId(token)
+
+    def _convert_id_to_token(self, index):
+        """Return the piece for `index`; indices at or above `vocab_size` map to `unk_token`."""
+        in_vocab = index < self.vocab_size
+        return self._tokenizer.spm.IdToPiece(index) if in_vocab else self.unk_token
+
+    def convert_tokens_to_string(self, tokens):
+        """Convert a sequence of tokens (strings) into a single string via `self._tokenizer.decode`."""
+        return self._tokenizer.decode(tokens)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Wrap one sequence, or a pair of sequences, of token ids in the model's special tokens. The resulting
+        layout is:
+
+        - single sequence: [CLS] X [SEP]
+        - pair of sequences: [CLS] A [SEP] B [SEP]
+
+        Args:
+            token_ids_0 (`List[int]`):
+                Ids of the first (or only) sequence.
+            token_ids_1 (`List[int]`, *optional*):
+                Ids of the second sequence when encoding a pair.
+
+        Returns:
+            `List[int]`: A new list holding the ids together with the special token ids.
+        """
+        cls, sep = [self.cls_token_id], [self.sep_token_id]
+        sequence = cls + token_ids_0 + sep
+        if token_ids_1 is not None:
+            # Pair input: append the second segment and a closing [SEP].
+            sequence = sequence + token_ids_1 + sep
+        return sequence
+
+    def get_special_tokens_mask(self,
+                                token_ids_0,
+                                token_ids_1=None,
+                                already_has_special_tokens=False):
+        """
+        Build the special-tokens mask for ids that do not yet contain special tokens, following the
+        `[CLS] A [SEP]` / `[CLS] A [SEP] B [SEP]` layout this tokenizer uses when it adds special tokens.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                When `True`, the computation is delegated to the parent class implementation.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True)
+
+        # [CLS] A [SEP], optionally followed by B [SEP]; 1 marks the special tokens.
+        mask = [1] + [0] * len(token_ids_0) + [1]
+        if token_ids_1 is not None:
+            mask += [0] * len(token_ids_1) + [1]
+        return mask
+
+    def create_token_type_ids_from_sequences(self,
+                                             token_ids_0,
+                                             token_ids_1=None):
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+        sequence pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1
+                                                        + sep) * [1]
+
+    def prepare_for_tokenization(self,
+                                 text,
+                                 is_split_into_words=False,
+                                 **kwargs):
+        add_prefix_space = kwargs.pop('add_prefix_space', False)
+        if is_split_into_words or add_prefix_space:
+            text = ' ' + text
+        return (text, kwargs)
+
+    def save_vocabulary(self,
+                        save_directory: str,
+                        filename_prefix: Optional[str] = None) -> Tuple[str]:
+        return self._tokenizer.save_pretrained(
+            save_directory, filename_prefix=filename_prefix)
+
+
+class SPMTokenizer:
+    r"""
+    Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    def __init__(self,
+                 vocab_file,
+                 split_by_punct=False,
+                 sp_model_kwargs: Optional[Dict[str, Any]] = None):
+        self.split_by_punct = split_by_punct
+        self.vocab_file = vocab_file
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
+        if not os.path.exists(vocab_file):
+            raise FileNotFoundError(f'{vocab_file} does not exist!')
+        spm.load(vocab_file)
+        bpe_vocab_size = spm.GetPieceSize()
+        # Token map
+        # <unk> 0+1
+        # <s> 1+1
+        # </s> 2+1
+        self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
+        self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
+        # self.vocab['[PAD]'] = 0
+        # self.vocab['[CLS]'] = 1
+        # self.vocab['[SEP]'] = 2
+        # self.vocab['[UNK]'] = 3
+
+        self.spm = spm
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state['spm'] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, 'sp_model_kwargs'):
+            self.sp_model_kwargs = {}
+
+        self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.spm.Load(self.vocab_file)
+
+    def tokenize(self, text):
+        return self._encode_as_pieces(text)
+
+    def convert_ids_to_tokens(self, ids):
+        tokens = []
+        for i in ids:
+            tokens.append(self.ids_to_tokens[i])
+        return tokens
+
+    def decode(self, tokens, start=-1, end=-1, raw_text=None):
+        if raw_text is None:
+            return self.spm.decode_pieces([t for t in tokens])
+        else:
+            words = self.split_to_words(raw_text)
+            word_tokens = [self.tokenize(w) for w in words]
+            token2words = [0] * len(tokens)
+            tid = 0
+            for i, w in enumerate(word_tokens):
+                for k, t in enumerate(w):
+                    token2words[tid] = i
+                    tid += 1
+            word_start = token2words[start]
+            word_end = token2words[end] if end < len(tokens) else len(words)
+            text = ''.join(words[word_start:word_end])
+            return text
+
+    def add_special_token(self, token):
+        if token not in self.__dict__.setdefault('special_tokens', []):  # lazy init
+            self.special_tokens.append(token)
+            if token not in self.vocab:
+                self.vocab[token] = len(self.ids_to_tokens)  # id == list index
+                self.ids_to_tokens.append(token)
+        return self.id(token)
+
+    def part_of_whole_word(self, token, is_bos=False):
+        """Return True if `token` continues a word, i.e. lacks the U+2581 word-start prefix."""
+        if is_bos:
+            return True
+        if (len(token) == 1 and (_is_whitespace(list(token)[0]))):
+            return False
+        if _is_control(list(token)[0]) or _is_punctuation(list(token)[0]):
+            return False
+        # Special tokens stand alone; this used to test the bound method itself.
+        if token in getattr(self, 'special_tokens', ()):
+            return False
+
+        word_start = b'\xe2\x96\x81'.decode('utf-8')
+        return not token.startswith(word_start)
+
+    def pad(self):
+        return '[PAD]'
+
+    def bos(self):
+        return '[CLS]'
+
+    def eos(self):
+        return '[SEP]'
+
+    def unk(self):
+        return '[UNK]'
+
+    def mask(self):
+        return '[MASK]'
+
+    def sym(self, id):
+        return self.ids_to_tokens[id]
+
+    def id(self, sym):
+        return self.vocab[sym] if sym in self.vocab else 1
+
+    def _encode_as_pieces(self, text):
+        text = convert_to_unicode(text)
+        if self.split_by_punct:
+            words = self._run_split_on_punc(text)
+            pieces = [self.spm.encode(w, out_type=str) for w in words]
+            return [p for w in pieces for p in w]
+        else:
+            return self.spm.encode(text, out_type=str)
+
+    def split_to_words(self, text):
+        pieces = self._encode_as_pieces(text)
+        word_start = b'\xe2\x96\x81'.decode('utf-8')
+        words = []
+        offset = 0
+        prev_end = 0
+        for i, p in enumerate(pieces):
+            if p.startswith(word_start):
+                if offset > prev_end:
+                    words.append(text[prev_end:offset])
+                prev_end = offset
+                w = p.replace(word_start, '')
+            else:
+                w = p
+            try:
+                s = text.index(w, offset)
+                pn = ''
+                k = i + 1
+                while k < len(pieces):
+                    pn = pieces[k].replace(word_start, '')
+                    if len(pn) > 0:
+                        break
+                    k += 1
+
+                if len(pn) > 0 and pn in text[offset:s]:
+                    offset = offset + 1
+                else:
+                    offset = s + len(w)
+            except Exception:
+                offset = offset + 1
+
+        if prev_end < offset:
+            words.append(text[prev_end:offset])
+
+        return words
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize('NFD', text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == 'Mn':
+                continue
+            output.append(char)
+        return ''.join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return [''.join(x) for x in output]
+
+    def save_pretrained(self, path: str, filename_prefix: str = None):
+        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
+        if filename_prefix is not None:
+            filename = filename_prefix + '-' + filename
+        full_path = os.path.join(path, filename)
+        with open(full_path, 'wb') as fs:
+            fs.write(self.spm.serialized_model_proto())
+        return (full_path, )
+
+
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == ' ' or char == '\t' or char == '\n' or char == '\r':
+        return True
+    cat = unicodedata.category(char)
+    if cat == 'Zs':
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == '\t' or char == '\n' or char == '\r':
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith('C'):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
+            cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith('P'):
+        return True
+    return False
+
+
+def convert_to_unicode(text):
+    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+    if isinstance(text, str):
+        return text
+    elif isinstance(text, bytes):
+        return text.decode('utf-8', 'ignore')
+    else:
+        raise ValueError(f'Unsupported string type: {type(text)}')
diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py
new file mode 100644
index 00000000..a1fcecf4
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py
@@ -0,0 +1,241 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020 Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization class for model DeBERTa."""
+
+import os
+from shutil import copyfile
+from typing import Optional, Tuple
+
+from transformers.file_utils import is_sentencepiece_available
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+
+from modelscope.utils import logger as logging
+
+if is_sentencepiece_available():
+    from .tokenization_deberta_v2 import DebertaV2Tokenizer
+else:
+    DebertaV2Tokenizer = None
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'spm.model',
+    'tokenizer_file': 'tokenizer.json'
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+PRETRAINED_INIT_CONFIGURATION = {}
+
+
+class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece)
+    and [rjieba-py](https://github.com/messense/rjieba-py).
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        bos_token (`string`, *optional*, defaults to `"[CLS]"`):
+            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+        eos_token (`string`, *optional*, defaults to `"[SEP]"`):
+            The end of sequence token. When building a sequence using special tokens, this is not the token that is
+            used for the end of sequence. The token used is the `sep_token`.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    slow_tokenizer_class = DebertaV2Tokenizer
+
+    def __init__(self,
+                 vocab_file=None,
+                 tokenizer_file=None,
+                 do_lower_case=False,
+                 split_by_punct=False,
+                 split_chinese=True,
+                 bos_token='[CLS]',
+                 eos_token='[SEP]',
+                 unk_token='[UNK]',
+                 sep_token='[SEP]',
+                 pad_token='[PAD]',
+                 cls_token='[CLS]',
+                 mask_token='[MASK]',
+                 **kwargs) -> None:
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            split_by_punct=split_by_punct,
+            split_chinese=split_chinese,
+            **kwargs,
+        )
+
+        self.do_lower_case = do_lower_case
+        self.split_by_punct = split_by_punct
+        self.split_chinese = split_chinese
+        self.vocab_file = vocab_file
+        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A DeBERTa sequence has the following format:
+
+        - single sequence: [CLS] X [SEP]
+        - pair of sequences: [CLS] A [SEP] B [SEP]
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self,
+                                token_ids_0,
+                                token_ids_1=None,
+                                already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True)
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + (
+                [0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(self,
+                                             token_ids_0,
+                                             token_ids_1=None):
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+        sequence pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1
+                                                        + sep) * [1]
+
+    def save_vocabulary(self,
+                        save_directory: str,
+                        filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
+                'tokenizer.')
+
+        if not os.path.isdir(save_directory):
+            logger.error(
+                f'Vocabulary path ({save_directory}) should be a directory')
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + '-' if filename_prefix else '')
+            + VOCAB_FILES_NAMES['vocab_file'])
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file, )
diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py
index 17324be9..4f466c23 100644
--- a/modelscope/models/nlp/masked_language.py
+++ b/modelscope/models/nlp/masked_language.py
@@ -6,6 +6,8 @@ from transformers import BertForMaskedLM as BertForMaskedLMTransformer
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import MODELS
+from modelscope.models.nlp.deberta_v2 import \
+    DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer
 from modelscope.models.nlp.structbert import SbertForMaskedLM
 from modelscope.models.nlp.veco import \
     VecoForMaskedLM as VecoForMaskedLMTransformer
@@ -125,3 +127,40 @@ class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer):
                      VecoForMaskedLM).from_pretrained(
                          pretrained_model_name_or_path=model_dir,
                          model_dir=model_dir)
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2)
+class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer):
+    """Deberta v2 for MLM model.
+
+    Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets.
+    """
+
+    def __init__(self, config, model_dir):
+        super(TorchModel, self).__init__(model_dir)
+        DebertaV2ForMaskedLMTransformer.__init__(self, config)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,  # accepted but not passed on to the parent forward
+                labels=None):
+        output = DebertaV2ForMaskedLMTransformer.forward(
+            self,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            labels=labels)
+        output[OutputKeys.INPUT_IDS] = input_ids
+        return output
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model_dir = kwargs.get('model_dir')
+        return super(DebertaV2ForMaskedLMTransformer,
+                     DebertaV2ForMaskedLM).from_pretrained(
+                         pretrained_model_name_or_path=model_dir,
+                         model_dir=model_dir)
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index 60a9631b..caba4122 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -13,7 +13,10 @@ from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['FillMaskPipeline']
-_type_map = {'veco': 'roberta', 'sbert': 'bert'}
+_type_map = {
+    'veco': 'roberta',
+    'sbert': 'bert',
+}
 
 
 @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
@@ -65,7 +68,7 @@ class FillMaskPipeline(Pipeline):
         self.config = Config.from_file(
             os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION))
         self.tokenizer = preprocessor.tokenizer
-        self.mask_id = {'roberta': 250001, 'bert': 103}
+        self.mask_id = {'roberta': 250001, 'bert': 103, 'deberta_v2': 4}
 
         self.rep_map = {
             'bert': {
@@ -85,7 +88,14 @@ class FillMaskPipeline(Pipeline):
                 '<s>': '',
                 '</s>': '',
                 '<unk>': ' '
-            }
+            },
+            'deberta_v2': {
+                '[PAD]': '',
+                r' +': ' ',
+                '[SEP]': '',
+                '[CLS]': '',
+                '[UNK]': ''
+            },
         }
 
     def forward(self, inputs: Dict[str, Any],
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 4882c477..825611d6 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -170,6 +170,9 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
         elif model_type == Models.veco:
             from modelscope.models.nlp.veco import VecoTokenizer
             return VecoTokenizer.from_pretrained(model_dir)
+        elif model_type == Models.deberta_v2:
+            from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer
+            return DebertaV2Tokenizer.from_pretrained(model_dir)
         else:
             return AutoTokenizer.from_pretrained(model_dir, use_fast=False)
 
diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py
new file mode 100644
index 00000000..4f3206cd
--- /dev/null
+++ b/tests/pipelines/test_deberta_tasks.py
@@ -0,0 +1,62 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import torch
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import DebertaV2ForMaskedLM
+from modelscope.models.nlp.deberta_v2 import (DebertaV2Tokenizer,
+                                              DebertaV2TokenizerFast)
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import FillMaskPipeline
+from modelscope.preprocessors import FillMaskPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class DeBERTaV2TaskTest(unittest.TestCase):
+    model_id_deberta = 'damo/nlp_debertav2_fill-mask_chinese-lite'
+
+    ori_text = '你师父差得动你，你师父可差不动我。'
+    test_input = '你师父差得动你，你师父可[MASK]不动我。'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        model_dir = snapshot_download(self.model_id_deberta)
+        preprocessor = FillMaskPreprocessor(
+            model_dir, first_sequence='sentence', second_sequence=None)
+        model = DebertaV2ForMaskedLM.from_pretrained(model_dir)
+        pipeline1 = FillMaskPipeline(model, preprocessor)
+        pipeline2 = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        ori_text = self.ori_text
+        test_input = self.test_input
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: '
+              f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        # deberta_v2: load the model by hub id, then build the pipeline around it
+        print(self.model_id_deberta)
+        model = Model.from_pretrained(self.model_id_deberta)
+        preprocessor = FillMaskPreprocessor(
+            model.model_dir, first_sequence='sentence', second_sequence=None)
+        pipeline_ins = pipeline(
+            task=Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        print(
+            f'\nori_text: {self.ori_text}\ninput: {self.test_input}\npipeline: '
+            f'{pipeline_ins(self.test_input)}\n')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.fill_mask, model=self.model_id_deberta)
+        ori_text = self.ori_text
+        test_input = self.test_input
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+              f'{pipeline_ins(test_input)}\n')
+
+
+if __name__ == '__main__':
+    unittest.main()

From 9e14d6727b7583fed29f0684a1171754a505388d Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Fri, 2 Sep 2022 11:02:43 +0800
Subject: [PATCH 040/175] [to #44571845]fix: ci support multiple image        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9974293

---
 .dev_scripts/ci_container_test.sh |   3 -
 .dev_scripts/dockerci.sh          |   5 +-
 requirements/tensorflow1x.txt     |   1 +
 tests/isolated_cases.txt          |   6 -
 tests/run.py                      | 191 ++++++++++++++++++++----------
 tests/run_config.yaml             |  31 +++++
 6 files changed, 165 insertions(+), 72 deletions(-)
 create mode 100644 requirements/tensorflow1x.txt
 delete mode 100644 tests/isolated_cases.txt
 create mode 100644 tests/run_config.yaml

diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index 2f18aff7..a53c08c6 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -4,8 +4,6 @@ pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs
 pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 pip install -r requirements/tests.txt
-# install numpy<=1.18 for tensorflow==1.15.x
-pip install "numpy<=1.18"
 
 git config --global --add safe.directory /Maas-lib
 
@@ -26,4 +24,3 @@ else
 fi
 echo "Running case with command: $ci_command"
 $ci_command
-#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py
diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh
index dbb79514..e76f2f14 100644
--- a/.dev_scripts/dockerci.sh
+++ b/.dev_scripts/dockerci.sh
@@ -7,7 +7,8 @@ gpus='7 6 5 4 3 2 1 0'
 cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
 cpu_sets_arr=($cpu_sets)
 is_get_file_lock=false
-CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND}
+# export RUN_CASE_BASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml'
+CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND}
 echo "ci command: $CI_COMMAND"
 for gpu in $gpus
 do
@@ -16,6 +17,7 @@ do
   echo "get gpu lock $gpu"
   CONTAINER_NAME="modelscope-ci-$gpu"
   let is_get_file_lock=true
+
   # pull image if there are update
   docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
   docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
@@ -38,6 +40,7 @@ do
              --net host  \
              ${IMAGE_NAME}:${IMAGE_VERSION} \
              $CI_COMMAND
+
   if [ $? -ne 0 ]; then
     echo "Running test case failed, please check the log!"
     exit -1
diff --git a/requirements/tensorflow1x.txt b/requirements/tensorflow1x.txt
new file mode 100644
index 00000000..b139efe1
--- /dev/null
+++ b/requirements/tensorflow1x.txt
@@ -0,0 +1 @@
+numpy==1.18.5
diff --git a/tests/isolated_cases.txt b/tests/isolated_cases.txt
deleted file mode 100644
index be85142a..00000000
--- a/tests/isolated_cases.txt
+++ /dev/null
@@ -1,6 +0,0 @@
- test_text_to_speech.py
- test_multi_modal_embedding.py
- test_ofa_tasks.py
- test_video_summarization.py
- test_dialog_modeling.py
- test_csanmt_translation.py
diff --git a/tests/run.py b/tests/run.py
index 79509745..478cb9d6 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -21,6 +21,7 @@ import pandas
 #         if 'import tensorflow' in front of 'import torch'.
 #         Puting a 'import torch' here can bypass this incompatibility.
 import torch
+import yaml
 
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import set_test_level, test_level
@@ -61,6 +62,7 @@ def statistics_test_result(df):
         result, total_cases, success_cases, failures_cases, error_cases,
         skipped_cases, expected_failure_cases, unexpected_success_cases)
 
+    print('Testing result summary.')
     print(result_msg)
     if result == 'FAILED':
         sys.exit(1)
@@ -88,6 +90,7 @@ def gather_test_suites_files(test_dir, pattern):
         for file in filenames:
             if fnmatch(file, pattern):
                 case_file_list.append(file)
+
     return case_file_list
 
 
@@ -125,18 +128,6 @@ def collect_test_results(case_results):
     return result_list
 
 
-class TestSuiteRunner:
-
-    def run(self, msg_queue, test_dir, test_suite_file):
-        test_suite = unittest.TestSuite()
-        test_case = unittest.defaultTestLoader.discover(
-            start_dir=test_dir, pattern=test_suite_file)
-        test_suite.addTest(test_case)
-        runner = TimeCostTextTestRunner()
-        test_suite_result = runner.run(test_suite)
-        msg_queue.put(collect_test_results(test_suite_result))
-
-
 def run_command_with_popen(cmd):
     with subprocess.Popen(
             cmd,
@@ -148,55 +139,126 @@ def run_command_with_popen(cmd):
             sys.stdout.write(line)
 
 
+def save_test_result(df, args):
+    if args.result_dir is not None:  # without a result dir nothing is saved
+        file_name = str(int(datetime.datetime.now().timestamp() * 1000))
+        os.umask(0)  # process-wide; lets the modes below apply unmasked
+        Path(args.result_dir).mkdir(mode=0o777, parents=True, exist_ok=True)
+        Path(os.path.join(args.result_dir, file_name)).touch(
+            mode=0o666, exist_ok=True)
+        df.to_pickle(os.path.join(args.result_dir, file_name))
+
+
+def run_command(cmd):
+    """Run ``cmd`` and log its output; a failing exit is logged, not raised."""
+    logger.info('Running command: %s' % ' '.join(cmd))
+    finished = subprocess.run(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    captured_out = finished.stdout.decode('utf8')
+    if finished.returncode == 0:
+        logger.info(captured_out)
+    else:
+        logger.error('stdout: %s, stderr: %s' %
+                     (captured_out, finished.stderr.decode('utf8')))
+
+
+def install_packages(pkgs):
+    """Pip-install every spec in ``pkgs`` with one pip invocation."""
+    pip_cmd = [sys.executable, '-m', 'pip', 'install']
+    pip_cmd.extend(pkgs)
+
+    run_command(pip_cmd)
+
+
+def install_requirements(requirements):
+    """Pip-install each file under requirements/, one pip run per file."""
+    links = (
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html')
+    pip_base = [sys.executable, '-m', 'pip', 'install', '-r']
+    for req_file in requirements:
+        # -f passes the URL above to pip as an extra find-links location
+        run_command(pip_base + ['requirements/%s' % req_file, '-f', links])
+
+
+def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
+                    result_dir):
+    """Prepare one env, then run every test suite mapped to it.
+
+    Args:
+        env_name: Name of the env being run.
+        env: Config dict of that env; its optional 'requirements' and
+            'dependencies' entries are pip-installed before any suite runs.
+        test_suite_env_map: Dict mapping test suite file name -> env name.
+        isolated_cases: Suite files that each run in a subprocess of
+            their own.
+        result_dir: Directory handed to the child runs via --result_dir.
+    """
+    if 'requirements' in env:
+        install_requirements(env['requirements'])
+    if 'dependencies' in env:
+        install_packages(env['dependencies'])
+
+    # Launch the children with sys.executable (not whatever 'python' is on
+    # PATH) so they run in the interpreter pip just installed into.
+    base_cmd = [sys.executable, 'tests/run.py', '--result_dir', result_dir]
+
+    for test_suite_file in isolated_cases:  # run case in subprocess
+        if test_suite_env_map.get(test_suite_file) == env_name:
+            run_command_with_popen(base_cmd + ['--pattern', test_suite_file])
+
+    # run remain cases in a process.
+    remain_suite_files = [
+        suite for suite, suite_env in test_suite_env_map.items()
+        if suite not in isolated_cases and suite_env == env_name
+    ]
+    if not remain_suite_files:
+        return
+    run_command_with_popen(base_cmd + ['--suites'] + remain_suite_files)
+
+
 def run_in_subprocess(args):
     # only case args.isolated_cases run in subporcess, all other run in a subprocess
     test_suite_files = gather_test_suites_files(
         os.path.abspath(args.test_dir), args.pattern)
+    isolated_cases = []
+    envs = {}  # env name -> env config; stays empty without a usable config
+    # put all the case in default env.
+    test_suite_env_map = dict.fromkeys(test_suite_files, 'default')
+
+    if args.run_config is not None and Path(args.run_config).exists():
+        with open(args.run_config) as f:
+            # NOTE(review): FullLoader assumes the run config is a trusted
+            # file; it must never be pointed at untrusted YAML.
+            run_config = yaml.load(f, Loader=yaml.FullLoader) or {}
+        isolated_cases = run_config.get('isolated') or []
+        envs = run_config.get('envs') or {}
+
+        for env_name, env_cfg in envs.items():
+            if env_name == 'default':
+                continue
+            for test_suite in (env_cfg or {}).get('tests') or []:
+                if test_suite in test_suite_env_map:
+                    test_suite_env_map[test_suite] = env_name
 
     if args.subprocess:  # run all case in subprocess
         isolated_cases = test_suite_files
-    else:
-        isolated_cases = []
-        with open(args.isolated_cases, 'r') as f:
-            for line in f:
-                if line.strip() in test_suite_files:
-                    isolated_cases.append(line.strip())
 
-    if not args.list_tests:
-        with tempfile.TemporaryDirectory() as temp_result_dir:
-            for test_suite_file in isolated_cases:  # run case in subprocess
-                cmd = [
-                    'python', 'tests/run.py', '--pattern', test_suite_file,
-                    '--result_dir', temp_result_dir
-                ]
-                run_command_with_popen(cmd)
-            result_dfs = []
-            # run remain cases in a process.
-            remain_suite_files = [
-                item for item in test_suite_files if item not in isolated_cases
-            ]
-            test_suite = gather_test_suites_in_files(args.test_dir,
-                                                     remain_suite_files,
-                                                     args.list_tests)
-            if test_suite.countTestCases() > 0:
-                runner = TimeCostTextTestRunner()
-                result = runner.run(test_suite)
-                result = collect_test_results(result)
-                df = test_cases_result_to_df(result)
+    with tempfile.TemporaryDirectory() as temp_result_dir:
+        for env in set(test_suite_env_map.values()):
+            # an env missing from the config (or no config) runs bare
+            run_case_in_env(env, envs.get(env) or {}, test_suite_env_map,
+                            isolated_cases, temp_result_dir)
+        result_dfs = []
+        result_path = Path(temp_result_dir)
+        for result in result_path.iterdir():
+            if Path.is_file(result):
+                df = pandas.read_pickle(result)
                 result_dfs.append(df)
-
-            # collect test results
-            result_path = Path(temp_result_dir)
-            for result in result_path.iterdir():
-                if Path.is_file(result):
-                    df = pandas.read_pickle(result)
-                    result_dfs.append(df)
-
-            result_pd = pandas.concat(
-                result_dfs)  # merge result of every test suite.
-            print_table_result(result_pd)
-            print_abnormal_case_info(result_pd)
-            statistics_test_result(result_pd)
+        result_pd = pandas.concat(
+            result_dfs)  # merge result of every test suite.
+        print_table_result(result_pd)
+        print_abnormal_case_info(result_pd)
+        statistics_test_result(result_pd)
 
 
 def get_object_full_name(obj):
@@ -293,15 +355,19 @@ def print_table_result(df):
 
 def main(args):
     runner = TimeCostTextTestRunner()
-    test_suite = gather_test_cases(
-        os.path.abspath(args.test_dir), args.pattern, args.list_tests)
+    if args.suites is not None and len(args.suites) > 0:
+        logger.info('Running: %s' % ' '.join(args.suites))
+        test_suite = gather_test_suites_in_files(args.test_dir, args.suites,
+                                                 args.list_tests)
+    else:
+        test_suite = gather_test_cases(
+            os.path.abspath(args.test_dir), args.pattern, args.list_tests)
     if not args.list_tests:
         result = runner.run(test_suite)
         result = collect_test_results(result)
         df = test_cases_result_to_df(result)
         if args.result_dir is not None:
-            file_name = str(int(datetime.datetime.now().timestamp() * 1000))
-            df.to_pickle(os.path.join(args.result_dir, file_name))
+            save_test_result(df, args)
         else:
             print_table_result(df)
             print_abnormal_case_info(df)
@@ -321,9 +387,9 @@ if __name__ == '__main__':
     parser.add_argument(
         '--disable_profile', action='store_true', help='disable profiling')
     parser.add_argument(
-        '--isolated_cases',
+        '--run_config',
         default=None,
-        help='specified isolated cases config file')
+        help='specified case run config file(yaml file)')
     parser.add_argument(
         '--subprocess',
         action='store_true',
@@ -332,6 +398,10 @@ if __name__ == '__main__':
         '--result_dir',
         default=None,
         help='Save result to directory, internal use only')
+    parser.add_argument(
+        '--suites',
+        nargs='*',
+        help='Run specified test suites(test suite file list)')
     args = parser.parse_args()
     set_test_level(args.level)
     os.environ['REGRESSION_BASELINE'] = '1'
@@ -340,10 +410,7 @@ if __name__ == '__main__':
         from utils import profiler
         logger.info('enable profile ...')
         profiler.enable()
-    if args.isolated_cases is not None or args.subprocess:
+    if args.run_config is not None or args.subprocess:
         run_in_subprocess(args)
-    elif args.isolated_cases is not None and args.subprocess:
-        print('isolated_cases and subporcess conflict')
-        sys.exit(1)
     else:
         main(args)
diff --git a/tests/run_config.yaml b/tests/run_config.yaml
new file mode 100644
index 00000000..591dcd66
--- /dev/null
+++ b/tests/run_config.yaml
@@ -0,0 +1,31 @@
+# envs option allows fine-grained control for test execution, for example,
+# python tests/run.py --env pytorch
+# would only trigger executions of all pytorch cases.
+# envs option defaults to None for backward compatibility
+isolated:  # test cases that may require an excessive amount of GPU memory, which will be executed in a dedicated process.
+  - test_text_to_speech.py
+  - test_multi_modal_embedding.py
+  - test_ofa_tasks.py
+  - test_video_summarization.py
+  - test_dialog_modeling.py
+  - test_csanmt_translation.py
+
+envs:
+  default: # default env (pytorch); cases not listed in another env run here.
+    dependencies: # required packages, pip-installed before the test cases run.
+      - numpy>=1.20
+  tensorflow1x: # cases executed with the tensorflow 1.x framework.
+    requirements: # requirements files installed before the test cases run.
+      - tensorflow1x.txt
+    dependencies: # required packages, pip-installed before the test cases run.
+      - numpy==1.18.5
+    tests:
+      - test_text_to_speech.py
+      - test_csanmt_translation.py
+      - test_translation_trainer.py
+      - test_ocr_detection.py
+      - test_automatic_speech_recognition.py
+      - test_image_matting.py
+      - test_person_image_cartoon.py
+      - test_skin_retouching.py
+      - test_image_style_transfer.py

From 1bac4f3349cbd1c343f4fbe1d9ec80198afd1a32 Mon Sep 17 00:00:00 2001
From: "xianzhe.xxz" <xianzhe.xxz@alibaba-inc.com>
Date: Fri, 2 Sep 2022 13:10:31 +0800
Subject: [PATCH 041/175] [to #42322933]add tinynas-detection pipeline and
 models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

接入tinynas-detection，新增tinynas object detection pipeline以及tinynas models。
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9938220
---
 modelscope/metainfo.py                        |   3 +
 .../models/cv/tinynas_detection/__init__.py   |  24 +
 .../cv/tinynas_detection/backbone/__init__.py |  16 +
 .../cv/tinynas_detection/backbone/darknet.py  | 126 ++++
 .../cv/tinynas_detection/backbone/tinynas.py  | 347 +++++++++
 .../cv/tinynas_detection/core/__init__.py     |   2 +
 .../cv/tinynas_detection/core/base_ops.py     | 474 +++++++++++++
 .../cv/tinynas_detection/core/neck_ops.py     | 324 +++++++++
 .../cv/tinynas_detection/core/repvgg_block.py | 205 ++++++
 .../models/cv/tinynas_detection/core/utils.py | 196 ++++++
 .../models/cv/tinynas_detection/detector.py   | 181 +++++
 .../cv/tinynas_detection/head/__init__.py     |  16 +
 .../tinynas_detection/head/gfocal_v2_tiny.py  | 361 ++++++++++
 .../cv/tinynas_detection/neck/__init__.py     |  16 +
 .../tinynas_detection/neck/giraffe_config.py  | 235 +++++++
 .../cv/tinynas_detection/neck/giraffe_fpn.py  | 661 ++++++++++++++++++
 .../tinynas_detection/neck/giraffe_fpn_v2.py  | 203 ++++++
 .../cv/tinynas_detection/tinynas_detector.py  |  16 +
 .../models/cv/tinynas_detection/utils.py      |  30 +
 .../cv/tinynas_detection_pipeline.py          |  61 ++
 tests/pipelines/test_tinynas_detection.py     |  20 +
 21 files changed, 3517 insertions(+)
 create mode 100644 modelscope/models/cv/tinynas_detection/__init__.py
 create mode 100644 modelscope/models/cv/tinynas_detection/backbone/__init__.py
 create mode 100644 modelscope/models/cv/tinynas_detection/backbone/darknet.py
 create mode 100755 modelscope/models/cv/tinynas_detection/backbone/tinynas.py
 create mode 100644 modelscope/models/cv/tinynas_detection/core/__init__.py
 create mode 100644 modelscope/models/cv/tinynas_detection/core/base_ops.py
 create mode 100644 modelscope/models/cv/tinynas_detection/core/neck_ops.py
 create mode 100644 modelscope/models/cv/tinynas_detection/core/repvgg_block.py
 create mode 100644 modelscope/models/cv/tinynas_detection/core/utils.py
 create mode 100644 modelscope/models/cv/tinynas_detection/detector.py
 create mode 100644 modelscope/models/cv/tinynas_detection/head/__init__.py
 create mode 100644 modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py
 create mode 100644 modelscope/models/cv/tinynas_detection/neck/__init__.py
 create mode 100644 modelscope/models/cv/tinynas_detection/neck/giraffe_config.py
 create mode 100644 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py
 create mode 100644 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
 create mode 100644 modelscope/models/cv/tinynas_detection/tinynas_detector.py
 create mode 100644 modelscope/models/cv/tinynas_detection/utils.py
 create mode 100644 modelscope/pipelines/cv/tinynas_detection_pipeline.py
 create mode 100644 tests/pipelines/test_tinynas_detection.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 971dd3f1..fd653bac 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -9,6 +9,8 @@ class Models(object):
 
         Model name should only contain model info but not task info.
     """
+    tinynas_detection = 'tinynas-detection'
+
     # vision models
     detection = 'detection'
     realtime_object_detection = 'realtime-object-detection'
@@ -133,6 +135,7 @@ class Pipelines(object):
     image_to_image_generation = 'image-to-image-generation'
     skin_retouching = 'unet-skin-retouching'
     tinynas_classification = 'tinynas-classification'
+    tinynas_detection = 'tinynas-detection'
     crowd_counting = 'hrnet-crowd-counting'
     action_detection = 'ResNetC3D-action-detection'
     video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'
diff --git a/modelscope/models/cv/tinynas_detection/__init__.py b/modelscope/models/cv/tinynas_detection/__init__.py
new file mode 100644
index 00000000..13532d10
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .tinynas_detector import TinynasDetector  # same name as exported below
+
+else:
+    _import_structure = {
+        'tinynas_detector': ['TinynasDetector'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/tinynas_detection/backbone/__init__.py b/modelscope/models/cv/tinynas_detection/backbone/__init__.py
new file mode 100644
index 00000000..186d06a3
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/backbone/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import copy
+
+from .darknet import CSPDarknet
+from .tinynas import load_tinynas_net
+
+
+def build_backbone(cfg):
+    params = copy.deepcopy(cfg)  # popping 'name' must not alter caller's cfg
+    kind = params.pop('name')
+    if kind == 'TinyNAS':
+        return load_tinynas_net(params)
+    if kind == 'CSPDarknet':
+        return CSPDarknet(**params)
diff --git a/modelscope/models/cv/tinynas_detection/backbone/darknet.py b/modelscope/models/cv/tinynas_detection/backbone/darknet.py
new file mode 100644
index 00000000..d3294f0d
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/backbone/darknet.py
@@ -0,0 +1,126 @@
+# Copyright (c) Megvii Inc. All rights reserved.
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import torch
+from torch import nn
+
+from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer,
+                             SPPBottleneck)
+
+
+class CSPDarknet(nn.Module):
+    """CSPDarknet backbone: a Focus stem followed by stages dark2..dark5."""
+    def __init__(
+        self,
+        dep_mul,
+        wid_mul,
+        out_features=('dark3', 'dark4', 'dark5'),
+        depthwise=False,
+        act='silu',
+        reparam=False,
+    ):
+        super(CSPDarknet, self).__init__()
+        assert out_features, 'please provide output features of Darknet'
+        self.out_features = out_features
+        Conv = DWConv if depthwise else BaseConv
+
+        base_channels = int(wid_mul * 64)  # 64 when wid_mul == 1
+        base_depth = max(round(dep_mul * 3), 1)  # 3 when dep_mul == 1
+
+        # stem
+        # (the third positional argument is what the old call passed as ksize)
+        self.stem = Focus(3, base_channels, 3, act=act)
+
+        # dark2
+        self.dark2 = nn.Sequential(
+            Conv(base_channels, base_channels * 2, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 2,
+                base_channels * 2,
+                n=base_depth,
+                depthwise=depthwise,
+                act=act,
+                reparam=reparam,
+            ),
+        )
+
+        # dark3
+        self.dark3 = nn.Sequential(
+            Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 4,
+                base_channels * 4,
+                n=base_depth * 3,
+                depthwise=depthwise,
+                act=act,
+                reparam=reparam,
+            ),
+        )
+
+        # dark4
+        self.dark4 = nn.Sequential(
+            Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 8,
+                base_channels * 8,
+                n=base_depth * 3,
+                depthwise=depthwise,
+                act=act,
+                reparam=reparam,
+            ),
+        )
+
+        # dark5
+        self.dark5 = nn.Sequential(
+            Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
+            SPPBottleneck(
+                base_channels * 16, base_channels * 16, activation=act),
+            CSPLayer(
+                base_channels * 16,
+                base_channels * 16,
+                n=base_depth,
+                shortcut=False,
+                depthwise=depthwise,
+                act=act,
+                reparam=reparam,
+            ),
+        )
+
+    def init_weights(self, pretrain=None):
+        """Load weights from checkpoint path ``pretrain``; no-op when None."""
+        if pretrain is None:
+            return
+        else:
+            pretrained_dict = torch.load(
+                pretrain, map_location='cpu')['state_dict']
+            new_params = self.state_dict().copy()
+            for k, v in pretrained_dict.items():
+                ks = k.split('.')
+                if ks[0] == 'fc' or ks[-1] == 'total_ops' or ks[
+                        -1] == 'total_params':
+                    continue
+                else:
+                    new_params[k] = v  # checkpoint value overrides the current one
+
+            self.load_state_dict(new_params)
+            print(f' load pretrain backbone from {pretrain}')
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        outputs['stem'] = x
+        x = self.dark2(x)
+        outputs['dark2'] = x
+        x = self.dark3(x)
+        outputs['dark3'] = x
+        x = self.dark4(x)
+        outputs['dark4'] = x
+        x = self.dark5(x)
+        outputs['dark5'] = x
+        features_out = [  # every stage is returned; out_features is not used
+            outputs['stem'], outputs['dark2'], outputs['dark3'],
+            outputs['dark4'], outputs['dark5']
+        ]
+
+        return features_out
diff --git a/modelscope/models/cv/tinynas_detection/backbone/tinynas.py b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py
new file mode 100755
index 00000000..814ee550
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py
@@ -0,0 +1,347 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import torch
+import torch.nn as nn
+
+from ..core.base_ops import Focus, SPPBottleneck, get_activation
+from ..core.repvgg_block import RepVggBlock
+
+
+class ConvKXBN(nn.Module):
+    """Bias-free Conv2d followed by BatchNorm2d, with no activation."""
+    def __init__(self, in_c, out_c, kernel_size, stride):
+        super(ConvKXBN, self).__init__()
+        self.conv1 = nn.Conv2d(
+            in_c,
+            out_c,
+            kernel_size,
+            stride, (kernel_size - 1) // 2,  # fifth positional arg: padding
+            groups=1,
+            bias=False)
+        self.bn1 = nn.BatchNorm2d(out_c)
+
+    def forward(self, x):
+        return self.bn1(self.conv1(x))
+
+
+class ConvKXBNRELU(nn.Module):
+    """ConvKXBN followed by an activation (torch.relu when act is None)."""
+    def __init__(self, in_c, out_c, kernel_size, stride, act='silu'):
+        super(ConvKXBNRELU, self).__init__()
+        self.conv = ConvKXBN(in_c, out_c, kernel_size, stride)
+        if act is None:
+            self.activation_function = torch.relu
+        else:
+            self.activation_function = get_activation(act)
+
+    def forward(self, x):
+        output = self.conv(x)
+        return self.activation_function(output)
+
+
+class ResConvK1KX(nn.Module):
+    """1x1 ConvKXBN then RepVggBlock; residual link only when stride != 2."""
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 btn_c,
+                 kernel_size,
+                 stride,
+                 force_resproj=False,
+                 act='silu'):
+        super(ResConvK1KX, self).__init__()
+        self.stride = stride
+        self.conv1 = ConvKXBN(in_c, btn_c, 1, 1)  # kernel 1, stride 1
+        self.conv2 = RepVggBlock(
+            btn_c, out_c, kernel_size, stride, act='identity')
+
+        if act is None:
+            self.activation_function = torch.relu
+        else:
+            self.activation_function = get_activation(act)
+
+        if stride == 2:  # built, but forward() skips the residual path then
+            self.residual_downsample = nn.AvgPool2d(kernel_size=2, stride=2)
+        else:
+            self.residual_downsample = nn.Identity()
+
+        if in_c != out_c or force_resproj:
+            self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
+        else:
+            self.residual_proj = nn.Identity()
+
+    def forward(self, x):
+        if self.stride != 2:  # the residual branch exists only without stride 2
+            reslink = self.residual_downsample(x)
+            reslink = self.residual_proj(reslink)
+
+        output = x
+        output = self.conv1(output)
+        output = self.activation_function(output)
+        output = self.conv2(output)
+        if self.stride != 2:
+            output = output + reslink
+        output = self.activation_function(output)
+
+        return output
+
+
+class SuperResConvK1KX(nn.Module):
+    """Sequence of num_blocks ResConvK1KX blocks applied one after another."""
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 btn_c,
+                 kernel_size,
+                 stride,
+                 num_blocks,
+                 with_spp=False,
+                 act='silu'):
+        super(SuperResConvK1KX, self).__init__()
+        if act is None:
+            self.act = torch.relu  # stored, but forward() never calls it
+        else:
+            self.act = get_activation(act)
+        self.block_list = nn.ModuleList()
+        for block_id in range(num_blocks):
+            if block_id == 0:  # only the first block sees in_c and stride
+                in_channels = in_c
+                out_channels = out_c
+                this_stride = stride
+                force_resproj = False  # as a part of CSPLayer, DO NOT need this flag
+                this_kernel_size = kernel_size
+            else:
+                in_channels = out_c
+                out_channels = out_c
+                this_stride = 1
+                force_resproj = False
+                this_kernel_size = kernel_size
+            the_block = ResConvK1KX(
+                in_channels,
+                out_channels,
+                btn_c,
+                this_kernel_size,
+                this_stride,
+                force_resproj,
+                act=act)
+            self.block_list.append(the_block)
+            if block_id == 0 and with_spp:  # SPP goes right after block 0
+                self.block_list.append(
+                    SPPBottleneck(out_channels, out_channels))
+
+    def forward(self, x):
+        output = x
+        for block in self.block_list:
+            output = block(output)
+        return output
+
+
+class ResConvKXKX(nn.Module):
+    """Stride 2: one ConvKXBNRELU; else ConvKXBN + RepVggBlock + residual."""
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 btn_c,
+                 kernel_size,
+                 stride,
+                 force_resproj=False,
+                 act='silu'):
+        super(ResConvKXKX, self).__init__()
+        self.stride = stride
+        if self.stride == 2:
+            self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act)
+        else:
+            self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1)
+            self.conv2 = RepVggBlock(
+                btn_c, out_c, kernel_size, stride, act='identity')
+
+            if act is None:
+                self.activation_function = torch.relu
+            else:
+                self.activation_function = get_activation(act)
+
+            if stride == 2:  # never true here: this branch runs for stride != 2
+                self.residual_downsample = nn.AvgPool2d(
+                    kernel_size=2, stride=2)
+            else:
+                self.residual_downsample = nn.Identity()
+
+            if in_c != out_c or force_resproj:
+                self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
+            else:
+                self.residual_proj = nn.Identity()
+
+    def forward(self, x):
+        if self.stride == 2:  # the downsampling variant has no residual path
+            return self.downsampler(x)
+        reslink = self.residual_downsample(x)
+        reslink = self.residual_proj(reslink)
+
+        output = x
+        output = self.conv1(output)
+        output = self.activation_function(output)
+        output = self.conv2(output)
+
+        output = output + reslink
+        output = self.activation_function(output)
+
+        return output
+
+
+class SuperResConvKXKX(nn.Module):
+    """Sequence of num_blocks ResConvKXKX blocks applied one after another."""
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 btn_c,
+                 kernel_size,
+                 stride,
+                 num_blocks,
+                 with_spp=False,
+                 act='silu'):
+        super(SuperResConvKXKX, self).__init__()
+        if act is None:
+            self.act = torch.relu  # stored, but forward() never calls it
+        else:
+            self.act = get_activation(act)
+        self.block_list = nn.ModuleList()
+        for block_id in range(num_blocks):
+            if block_id == 0:  # only the first block sees in_c and stride
+                in_channels = in_c
+                out_channels = out_c
+                this_stride = stride
+                force_resproj = False  # as a part of CSPLayer, DO NOT need this flag
+                this_kernel_size = kernel_size
+            else:
+                in_channels = out_c
+                out_channels = out_c
+                this_stride = 1
+                force_resproj = False
+                this_kernel_size = kernel_size
+            the_block = ResConvKXKX(
+                in_channels,
+                out_channels,
+                btn_c,
+                this_kernel_size,
+                this_stride,
+                force_resproj,
+                act=act)
+            self.block_list.append(the_block)
+            if block_id == 0 and with_spp:  # SPP goes right after block 0
+                self.block_list.append(
+                    SPPBottleneck(out_channels, out_channels))
+
+    def forward(self, x):
+        output = x
+        for block in self.block_list:
+            output = block(output)
+        return output
+
+
+class TinyNAS(nn.Module):
+    """Backbone built block by block from a list of block-description dicts."""
+    def __init__(self,
+                 structure_info=None,
+                 out_indices=[0, 1, 2, 4, 5],
+                 out_channels=[None, None, 128, 256, 512],
+                 with_spp=False,
+                 use_focus=False,
+                 need_conv1=True,
+                 act='silu'):
+        super(TinyNAS, self).__init__()
+        assert len(out_indices) == len(out_channels)
+        self.out_indices = out_indices
+        self.need_conv1 = need_conv1
+
+        self.block_list = nn.ModuleList()
+        if need_conv1:
+            self.conv1_list = nn.ModuleList()
+        for idx, block_info in enumerate(structure_info):
+            the_block_class = block_info['class']
+            if the_block_class == 'ConvKXBNRELU':
+                if use_focus:  # Focus replaces the conv; block_info['s'] unused
+                    the_block = Focus(block_info['in'], block_info['out'],
+                                      block_info['k'])
+                else:
+                    the_block = ConvKXBNRELU(
+                        block_info['in'],
+                        block_info['out'],
+                        block_info['k'],
+                        block_info['s'],
+                        act=act)
+                self.block_list.append(the_block)
+            elif the_block_class == 'SuperResConvK1KX':
+                spp = with_spp if idx == len(structure_info) - 1 else False
+                the_block = SuperResConvK1KX(
+                    block_info['in'],
+                    block_info['out'],
+                    block_info['btn'],
+                    block_info['k'],
+                    block_info['s'],
+                    block_info['L'],
+                    spp,
+                    act=act)
+                self.block_list.append(the_block)
+            elif the_block_class == 'SuperResConvKXKX':
+                spp = with_spp if idx == len(structure_info) - 1 else False
+                the_block = SuperResConvKXKX(
+                    block_info['in'],
+                    block_info['out'],
+                    block_info['btn'],
+                    block_info['k'],
+                    block_info['s'],
+                    block_info['L'],
+                    spp,
+                    act=act)
+                self.block_list.append(the_block)
+            if need_conv1:  # one conv1_list slot per structure_info entry
+                if idx in self.out_indices and out_channels[
+                        self.out_indices.index(idx)] is not None:
+                    self.conv1_list.append(
+                        nn.Conv2d(block_info['out'],
+                                  out_channels[self.out_indices.index(idx)],
+                                  1))
+                else:
+                    self.conv1_list.append(None)  # placeholder: no projection
+
+    def init_weights(self, pretrain=None):
+        pass  # nothing is loaded here; ``pretrain`` is ignored
+
+    def forward(self, x):
+        output = x
+        stage_feature_list = []
+        for idx, block in enumerate(self.block_list):
+            output = block(output)
+            if idx in self.out_indices:  # collect only the requested stages
+                if self.need_conv1 and self.conv1_list[idx] is not None:
+                    true_out = self.conv1_list[idx](output)
+                    stage_feature_list.append(true_out)
+                else:
+                    stage_feature_list.append(output)
+        return stage_feature_list
+
+
+def load_tinynas_net(backbone_cfg):
+    """Build a TinyNAS backbone from the structure string in backbone_cfg.
+
+    The pieces of ``net_structure_str`` are stripped, joined and parsed as a
+    Python literal; 'nbitsA'/'nbitsW' keys are removed from every layer.
+    """
+    import ast
+
+    joined = ''.join(part.strip() for part in backbone_cfg.net_structure_str)
+    layers = ast.literal_eval(joined)
+    for layer in layers:
+        layer.pop('nbitsA', None)
+        layer.pop('nbitsW', None)
+
+    return TinyNAS(
+        structure_info=layers,
+        out_indices=backbone_cfg.out_indices,
+        out_channels=backbone_cfg.out_channels,
+        with_spp=backbone_cfg.with_spp,
+        use_focus=backbone_cfg.use_focus,
+        need_conv1=backbone_cfg.need_conv1,
+        act=backbone_cfg.act,
+    )
diff --git a/modelscope/models/cv/tinynas_detection/core/__init__.py b/modelscope/models/cv/tinynas_detection/core/__init__.py
new file mode 100644
index 00000000..3dad5e72
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
diff --git a/modelscope/models/cv/tinynas_detection/core/base_ops.py b/modelscope/models/cv/tinynas_detection/core/base_ops.py
new file mode 100644
index 00000000..62729ca2
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/base_ops.py
@@ -0,0 +1,474 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .repvgg_block import RepVggBlock
+
+
+class SiLU(nn.Module):
+    """Export-friendly SiLU built from a plain multiply and sigmoid."""
+
+    @staticmethod
+    def forward(x):
+        return torch.mul(x, torch.sigmoid(x))
+
+
+def get_activation(name='silu', inplace=True):
+    """Return the activation module for ``name`` ('silu', 'relu', 'lrelu')."""
+    if name == 'silu':
+        return nn.SiLU(inplace=inplace)
+    if name == 'relu':
+        return nn.ReLU(inplace=inplace)
+    if name == 'lrelu':
+        # LeakyReLU uses a fixed negative slope of 0.1.
+        return nn.LeakyReLU(0.1, inplace=inplace)
+    raise AttributeError('Unsupported act type: {}'.format(name))
+
+
+def get_norm(name, out_channels, inplace=True):
+    """Build the 'bn' or 'gn' (32 groups) norm layer; ``inplace`` is unused."""
+    builders = {'bn': nn.BatchNorm2d, 'gn': lambda c: nn.GroupNorm(32, c)}
+    if name not in builders:
+        raise AttributeError('Unsupported norm type: {}'.format(name))
+    return builders[name](out_channels)
+
+
+class BaseConv(nn.Module):
+    """A Conv2d -> optional norm -> optional activation block."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride=1,
+                 groups=1,
+                 bias=False,
+                 act='silu',
+                 norm='bn'):
+        super().__init__()
+        pad = (ksize - 1) // 2  # same padding
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=pad,
+            groups=groups,
+            bias=bias,
+        )
+        if norm is not None:
+            self.bn = get_norm(norm, out_channels, inplace=True)
+        if act is not None:
+            self.act = get_activation(act, inplace=True)
+        self.with_norm = norm is not None
+        self.with_act = act is not None
+
+    def forward(self, x):
+        x = self.conv(x)
+        if self.with_norm:
+            x = self.bn(x)
+        if self.with_act:
+            x = self.act(x)
+        return x
+
+    def fuseforward(self, x):
+        """Conv + activation without the norm layer; honours ``act=None``."""
+        x = self.conv(x)
+        return self.act(x) if self.with_act else x
+
+
+class DepthWiseConv(nn.Module):
+    """Depthwise conv followed by a pointwise conv (see ``self.order``)."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride=1,
+                 groups=None,
+                 bias=False,
+                 act='silu',
+                 norm='bn'):
+        super().__init__()
+        padding = (ksize - 1) // 2
+        self.depthwise = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=padding,
+            groups=in_channels,
+            bias=bias,
+        )
+        self.pointwise = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias)
+        if norm is not None:
+            self.dwnorm = get_norm(norm, in_channels, inplace=True)
+            self.pwnorm = get_norm(norm, out_channels, inplace=True)
+        if act is not None:
+            self.act = get_activation(act, inplace=True)
+        self.with_norm = norm is not None
+        self.with_act = act is not None
+        # NOTE(review): pwnorm is not in this list, so it is never applied.
+        self.order = ['depthwise', 'dwnorm', 'pointwise', 'act']
+
+    def forward(self, x):
+        # Layers that were never created (norm=None / act=None) are skipped.
+        for layer_name in self.order:
+            layer = getattr(self, layer_name, None)
+            if layer is not None:
+                x = layer(x)
+        return x
+
+
+class DWConv(nn.Module):
+    """Depthwise ``BaseConv`` followed by a 1x1 pointwise ``BaseConv``."""
+
+    def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'):
+        super().__init__()
+        # groups == in_channels makes the first conv depthwise.
+        self.dconv = BaseConv(
+            in_channels,
+            in_channels,
+            ksize=ksize,
+            stride=stride,
+            groups=in_channels,
+            act=act)
+        self.pconv = BaseConv(
+            in_channels, out_channels, ksize=1, stride=1, groups=1, act=act)
+
+    def forward(self, x):
+        """Apply the depthwise stage, then the pointwise stage."""
+        return self.pconv(self.dconv(x))
+
+
+class Bottleneck(nn.Module):
+    """Two stacked convs; adds the input back when ``use_add`` is true."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+        reparam=False,
+    ):
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)
+        k_conv1 = 3 if reparam else 1
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, k_conv1, stride=1, act=act)
+        if reparam:
+            conv2_cls = RepVggBlock
+        elif depthwise:
+            conv2_cls = DWConv
+        else:
+            conv2_cls = BaseConv
+        self.conv2 = conv2_cls(
+            hidden_channels, out_channels, 3, stride=1, act=act)
+        self.use_add = shortcut and in_channels == out_channels
+
+    def forward(self, x):
+        y = self.conv2(self.conv1(x))
+        return y + x if self.use_add else y
+
+
+class ResLayer(nn.Module):
+    """Residual block: 1x1 squeeze to half width, then 3x3 expand back."""
+
+    def __init__(self, in_channels: int):
+        super().__init__()
+        half = in_channels // 2
+        self.layer1 = BaseConv(
+            in_channels, half, ksize=1, stride=1, act='lrelu')
+        self.layer2 = BaseConv(
+            half, in_channels, ksize=3, stride=1, act='lrelu')
+
+    def forward(self, x):
+        residual = self.layer1(x)
+        return x + self.layer2(residual)
+
+
+class SPPBottleneck(nn.Module):
+    """Spatial pyramid pooling layer used in YOLOv3-SPP"""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_sizes=(5, 9, 13),
+                 activation='silu'):
+        super().__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=activation)
+        self.m = nn.ModuleList([
+            nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
+            for ks in kernel_sizes
+        ])
+        # conv2 consumes the conv1 output plus one pooled copy per kernel size.
+        self.conv2 = BaseConv(
+            hidden_channels * (len(kernel_sizes) + 1),
+            out_channels, 1, stride=1, act=activation)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        pooled = [pool(x) for pool in self.m]
+        return self.conv2(torch.cat([x] + pooled, dim=1))
+
+
+class CSPLayer(nn.Module):
+    """C3 in yolov5, CSP Bottleneck with 3 convolutions"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=1,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+        reparam=False,
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of Bottlenecks. Default value: 1.
+            expansion (float): hidden width as a fraction of out_channels.
+            shortcut, depthwise, act, reparam: forwarded to every Bottleneck.
+        """
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv3 = BaseConv(
+            2 * hidden_channels, out_channels, 1, stride=1, act=act)
+        blocks = [
+            Bottleneck(
+                hidden_channels,
+                hidden_channels,
+                shortcut=shortcut,
+                expansion=1.0,
+                depthwise=depthwise,
+                act=act,
+                reparam=reparam) for _ in range(n)
+        ]
+        self.m = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        main = self.conv1(x)
+        skip = self.conv2(x)
+        fused = torch.cat((self.m(main), skip), dim=1)
+        return self.conv3(fused)
+
+
+class Focus(nn.Module):
+    """Focus width and height information into channel space.
+
+    The input is split into its four 2x2-strided sub-grids, which are
+    concatenated on the channel axis and passed through one ``BaseConv``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=1,
+                 stride=1,
+                 act='silu'):
+        super().__init__()
+        self.conv = BaseConv(
+            in_channels * 4, out_channels, ksize, stride, act=act)
+
+    def forward(self, x):
+        """Gather the four sub-grids of ``x`` and run ``self.conv`` on them."""
+        # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
+        even, odd = slice(None, None, 2), slice(1, None, 2)
+        # Channel order: top-left, bottom-left, top-right, bottom-right.
+        patches = (
+            x[..., even, even],
+            x[..., odd, even],
+            x[..., even, odd],
+            x[..., odd, odd],
+        )
+        return self.conv(torch.cat(patches, dim=1))
+
+
+class fast_Focus(nn.Module):
+    """Focus implemented with four frozen 2x2 stride-2 depthwise convs."""
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=1,
+                 stride=1,
+                 act='silu'):
+        super().__init__()
+        # Order matches Focus: top-left, bottom-left, top-right, bottom-right.
+        self.conv1 = self.focus_conv(w1=1.0, channels=in_channels)
+        self.conv2 = self.focus_conv(w3=1.0, channels=in_channels)
+        self.conv3 = self.focus_conv(w2=1.0, channels=in_channels)
+        self.conv4 = self.focus_conv(w4=1.0, channels=in_channels)
+        self.conv = BaseConv(
+            in_channels * 4, out_channels, ksize, stride, act=act)
+
+    def forward(self, x):
+        return self.conv(
+            torch.cat(
+                [self.conv1(x),
+                 self.conv2(x),
+                 self.conv3(x),
+                 self.conv4(x)], 1))
+
+    def focus_conv(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0, channels=3):
+        conv = nn.Conv2d(channels, channels, 2, 2, groups=channels, bias=False)
+        conv.weight = self.init_weights_constant(w1, w2, w3, w4, channels)
+        conv.weight.requires_grad = False
+        return conv
+
+    def init_weights_constant(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0,
+                              channels=3):
+        kernel = [[[w1, w2], [w3, w4]]]
+        return nn.Parameter(torch.tensor([kernel] * channels))
+
+
+# shufflenet block
+def channel_shuffle(x, groups=2):
+    """Interleave the channels of ``x`` across ``groups`` channel groups."""
+    n, c, w, h = x.shape
+    # (n, g, c/g, w, h) -> swap the two channel axes -> flatten back.
+    x = x.view(n, groups, c // groups, w, h)
+    x = x.transpose(1, 2).contiguous()
+    return x.view(n, -1, w, h)
+
+
+def conv_1x1_bn(in_c, out_c, stride=1):
+    """1x1 Conv2d (no bias) -> BatchNorm2d -> in-place ReLU."""
+    pointwise = nn.Conv2d(in_c, out_c, 1, stride, 0, bias=False)
+    return nn.Sequential(pointwise, nn.BatchNorm2d(out_c), nn.ReLU(True))
+
+
+def conv_bn(in_c, out_c, stride=2):
+    """3x3 Conv2d (padding 1, no bias) -> BatchNorm2d -> in-place ReLU."""
+    spatial = nn.Conv2d(in_c, out_c, 3, stride, 1, bias=False)
+    return nn.Sequential(spatial, nn.BatchNorm2d(out_c), nn.ReLU(True))
+
+
+class ShuffleBlock(nn.Module):
+    """ShuffleNet-style unit that ends with a 2-group channel shuffle."""
+
+    def __init__(self, in_c, out_c, downsample=False):
+        super(ShuffleBlock, self).__init__()
+        self.downsample = downsample
+        half_c = out_c // 2
+        if downsample:
+            self.branch1 = nn.Sequential(
+                # 3*3 dw conv; stride is 1 here,
+                # so this branch does not reduce the spatial size.
+                nn.Conv2d(in_c, in_c, 3, 1, 1, groups=in_c, bias=False),
+                nn.BatchNorm2d(in_c),
+                # 1*1 pw conv
+                nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True))
+            self.branch2 = nn.Sequential(
+                # 1*1 pw conv
+                nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True),
+                # 3*3 dw conv; stride is 1 here as well,
+                # so this branch does not reduce the spatial size either.
+                nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False),
+                nn.BatchNorm2d(half_c),
+                # 1*1 pw conv
+                nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True))
+        else:
+            # in_c = out_c
+            assert in_c == out_c
+
+            self.branch2 = nn.Sequential(
+                # 1*1 pw conv
+                nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True),
+                # 3*3 dw conv, stride = 1
+                nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False),
+                nn.BatchNorm2d(half_c),
+                # 1*1 pw conv
+                nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True))
+
+    def forward(self, x):
+        out = None
+        if self.downsample:
+            # both branches see the full input, so no channel split is needed
+            out = torch.cat((self.branch1(x), self.branch2(x)), 1)
+        else:
+            # channel split: first half passes through, second half is convolved
+            channels = x.shape[1]
+            c = channels // 2
+            x1 = x[:, :c, :, :]
+            x2 = x[:, c:, :, :]
+            out = torch.cat((x1, self.branch2(x2)), 1)
+        return channel_shuffle(out, 2)
+
+
+class ShuffleCSPLayer(nn.Module):
+    """CSP-style layer: two 1x1 branches, concatenated and channel-shuffled."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=1,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of Bottlenecks. Default value: 1.
+            expansion (float): hidden width as a fraction of out_channels.
+        """
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        blocks = [
+            Bottleneck(
+                hidden_channels,
+                hidden_channels,
+                shortcut=shortcut,
+                expansion=1.0,
+                depthwise=depthwise,
+                act=act) for _ in range(n)
+        ]
+        self.m = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        main = self.conv1(x)
+        skip = self.conv2(x)
+        main = self.m(main)
+        merged = torch.cat((main, skip), dim=1)
+        # add channel shuffle
+        return channel_shuffle(merged, 2)
diff --git a/modelscope/models/cv/tinynas_detection/core/neck_ops.py b/modelscope/models/cv/tinynas_detection/core/neck_ops.py
new file mode 100644
index 00000000..7f481665
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/neck_ops.py
@@ -0,0 +1,324 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Swish(nn.Module):
+    """Swish activation, ``x * sigmoid(x)``; optionally overwrites ``x``."""
+
+    def __init__(self, inplace=True):
+        super(Swish, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        if not self.inplace:
+            return x * torch.sigmoid(x)
+        # In-place path mutates and returns the caller's tensor.
+        return x.mul_(torch.sigmoid(x))
+
+
+def get_activation(name='silu', inplace=True):
+    """Resolve ``name`` to an activation module.
+
+    ``None`` -> ``nn.Identity``; an ``nn.Module`` is returned unchanged.
+    """
+    if name is None:
+        return nn.Identity()
+    if isinstance(name, nn.Module):
+        return name
+    if not isinstance(name, str):
+        raise AttributeError('Unsupported act type: {}'.format(name))
+    if name == 'silu':
+        return nn.SiLU(inplace=inplace)
+    if name == 'relu':
+        return nn.ReLU(inplace=inplace)
+    if name == 'lrelu':
+        return nn.LeakyReLU(0.1, inplace=inplace)
+    if name == 'swish':
+        return Swish(inplace=inplace)
+    if name == 'hardsigmoid':
+        return nn.Hardsigmoid(inplace=inplace)
+    raise AttributeError('Unsupported act type: {}'.format(name))
+
+
+class ConvBNLayer(nn.Module):
+    """Bias-free Conv2d followed by BatchNorm2d and an activation."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 act=None):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=False)
+        self.bn = nn.BatchNorm2d(ch_out)
+        # get_activation maps act=None to nn.Identity.
+        self.act = get_activation(act, inplace=True)
+
+    def forward(self, x):
+        """Apply conv, batch norm and the activation in that order."""
+        x = self.bn(self.conv(x))
+        return self.act(x)
+
+
+class RepVGGBlock(nn.Module):
+    """3x3 and 1x1 ConvBN branches that can be fused into one 3x3 conv."""
+
+    def __init__(self, ch_in, ch_out, act='relu', deploy=False):
+        super(RepVGGBlock, self).__init__()
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.deploy = deploy
+        self.in_channels = ch_in
+        self.groups = 1
+        if self.deploy is False:
+            self.rbr_dense = ConvBNLayer(
+                ch_in, ch_out, 3, stride=1, padding=1, act=None)
+            self.rbr_1x1 = ConvBNLayer(
+                ch_in, ch_out, 1, stride=1, padding=0, act=None)
+            # The BN-only identity branch is disabled in this variant.
+            self.rbr_identity = None
+        else:
+            self.rbr_reparam = nn.Conv2d(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=1)
+        self.act = get_activation(act) if act is None or isinstance(
+            act, (str, dict)) else act
+
+    def forward(self, x):
+        if self.deploy:
+            y = self.rbr_reparam(x)
+        else:
+            if self.rbr_identity is None:
+                y = self.rbr_dense(x) + self.rbr_1x1(x)
+            else:
+                y = self.rbr_dense(x) + self.rbr_1x1(x) + self.rbr_identity(x)
+
+        y = self.act(y)
+        return y
+
+    def switch_to_deploy(self):
+        """Fuse the branches into ``rbr_reparam``; no-op once deployed."""
+        if self.deploy:
+            # Training branches are already gone; nothing left to fuse.
+            return
+        if not hasattr(self, 'rbr_reparam'):
+            self.rbr_reparam = nn.Conv2d(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=1)
+        kernel, bias = self.get_equivalent_kernel_bias()
+        self.rbr_reparam.weight.data = kernel
+        self.rbr_reparam.bias.data = bias
+        for para in self.parameters():
+            para.detach_()
+        self.__delattr__('rbr_dense')
+        self.__delattr__('rbr_1x1')
+        if hasattr(self, 'rbr_identity'):
+            self.__delattr__('rbr_identity')
+        if hasattr(self, 'id_tensor'):
+            self.__delattr__('id_tensor')
+        self.deploy = True
+
+    def get_equivalent_kernel_bias(self):
+        """Return the summed 3x3 kernel and bias of all training branches."""
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
+        kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
+        return kernel3x3 + self._pad_1x1_to_3x3_tensor(
+            kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        if kernel1x1 is None:
+            return 0
+        else:
+            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch):
+        """Fold a branch's BatchNorm statistics into (kernel, bias)."""
+        if branch is None:
+            return 0, 0
+        if isinstance(branch, ConvBNLayer):
+            kernel = branch.conv.weight
+            running_mean = branch.bn.running_mean
+            running_var = branch.bn.running_var
+            gamma = branch.bn.weight
+            beta = branch.bn.bias
+            eps = branch.bn.eps
+        else:
+            assert isinstance(branch, nn.BatchNorm2d)
+            if not hasattr(self, 'id_tensor'):
+                input_dim = self.in_channels // self.groups
+                kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
+                                        dtype=np.float32)
+                for i in range(self.in_channels):
+                    kernel_value[i, i % input_dim, 1, 1] = 1
+                self.id_tensor = torch.from_numpy(kernel_value).to(
+                    branch.weight.device)
+            kernel = self.id_tensor
+            running_mean = branch.running_mean
+            running_var = branch.running_var
+            gamma = branch.weight
+            beta = branch.bias
+            eps = branch.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma / std
+
+
+class BasicBlock(nn.Module):
+    """A single ``RepVGGBlock`` with an optional residual add."""
+
+    def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
+        super().__init__()
+        # Checked unconditionally, even when shortcut is False.
+        assert ch_in == ch_out
+        # Only conv2 exists in this block; there is no conv1 stage.
+        self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        """Return ``conv2(x)``, plus ``x`` when ``shortcut`` is enabled."""
+        out = self.conv2(x)
+        if not self.shortcut:
+            return out
+        return x + out
+
+
+class BasicBlock_3x3(nn.Module):
+    """3x3 ``ConvBNLayer`` then ``RepVGGBlock``, with an optional skip add."""
+
+    def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
+        super().__init__()
+        # Checked unconditionally, even when shortcut is False.
+        assert ch_in == ch_out
+        self.conv1 = ConvBNLayer(
+            ch_in, ch_out, 3, stride=1, padding=1, act=act)
+        self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        """Apply conv1 then conv2; add ``x`` back when ``shortcut`` is set."""
+        out = self.conv2(self.conv1(x))
+        if not self.shortcut:
+            return out
+        return x + out
+
+
+class BasicBlock_3x3_Reverse(nn.Module):
+    """``RepVGGBlock`` then 3x3 ``ConvBNLayer``, with an optional skip add."""
+
+    def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
+        super().__init__()
+        # Checked unconditionally, even when shortcut is False.
+        assert ch_in == ch_out
+        self.conv1 = ConvBNLayer(
+            ch_in, ch_out, 3, stride=1, padding=1, act=act)
+        self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        """Apply conv2 then conv1; add ``x`` back when ``shortcut`` is set."""
+        out = self.conv1(self.conv2(x))
+        if not self.shortcut:
+            return out
+        return x + out
+
+
+class SPP(nn.Module):
+    """Concatenate ``x`` with its max-pooled copies, then apply a ConvBNLayer.
+
+    One stride-1 ``MaxPool2d`` is created per entry of ``pool_size``.
+    """
+
+    def __init__(
+        self,
+        ch_in,
+        ch_out,
+        k,
+        pool_size,
+        act='swish',
+    ):
+        super().__init__()
+        self.pool = []
+        for idx, size in enumerate(pool_size):
+            layer = nn.MaxPool2d(
+                kernel_size=size, stride=1, padding=size // 2, ceil_mode=False)
+            # Registered as pool0, pool1, ...; self.pool keeps iteration order.
+            self.add_module('pool{}'.format(idx), layer)
+            self.pool.append(layer)
+        self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act)
+
+    def forward(self, x):
+        """Pool ``x`` with every layer, concatenate with ``x``, then conv."""
+        pooled = [layer(x) for layer in self.pool]
+        return self.conv(torch.cat([x] + pooled, dim=1))
+
+
+class CSPStage(nn.Module):
+    """CSP stage: conv1(x) and every conv2-branch output feed conv3."""
+
+    def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False):
+        super(CSPStage, self).__init__()
+
+        ch_mid = int(ch_out // 2)
+        self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
+        self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
+        self.convs = nn.Sequential()
+
+        next_ch_in = ch_mid
+        for i in range(n):
+            if block_fn == 'BasicBlock':
+                self.convs.add_module(
+                    str(i),
+                    BasicBlock(next_ch_in, ch_mid, act=act, shortcut=False))
+            elif block_fn == 'BasicBlock_3x3':
+                self.convs.add_module(
+                    str(i),
+                    BasicBlock_3x3(next_ch_in, ch_mid, act=act, shortcut=True))
+            elif block_fn == 'BasicBlock_3x3_Reverse':
+                self.convs.add_module(
+                    str(i),
+                    BasicBlock_3x3_Reverse(
+                        next_ch_in, ch_mid, act=act, shortcut=True))
+            else:
+                raise NotImplementedError
+            if i == (n - 1) // 2 and spp:
+                self.convs.add_module(
+                    'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act))
+        # forward() keeps one tensor per entry of self.convs (the SPP module
+        # included) plus the conv1 branch, so size conv3 from len(self.convs).
+        self.conv3 = ConvBNLayer(
+            ch_mid * (len(self.convs) + 1), ch_out, 1, act=act)
+
+    def forward(self, x):
+        y1 = self.conv1(x)
+        y2 = self.conv2(x)
+        mid_out = [y1]
+        for conv in self.convs:
+            y2 = conv(y2)
+            mid_out.append(y2)
+        y = torch.cat(mid_out, axis=1)
+        y = self.conv3(y)
+        return y
diff --git a/modelscope/models/cv/tinynas_detection/core/repvgg_block.py b/modelscope/models/cv/tinynas_detection/core/repvgg_block.py
new file mode 100644
index 00000000..06966a4e
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/repvgg_block.py
@@ -0,0 +1,205 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.nn.init as init
+from torch.nn.parameter import Parameter
+
+
+def get_activation(name='silu', inplace=True):
+    """Map an activation name to a freshly constructed module."""
+    if name == 'silu':
+        return nn.SiLU(inplace=inplace)
+    if name == 'relu':
+        return nn.ReLU(inplace=inplace)
+    if name == 'lrelu':
+        return nn.LeakyReLU(0.1, inplace=inplace)
+    if name == 'identity':
+        # Identity takes no inplace flag.
+        return nn.Identity()
+    raise AttributeError('Unsupported act type: {}'.format(name))
+
+
+def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1):
+    """Basic cell for rep-style block: bias-free conv ('conv') + BN ('bn')."""
+    conv = nn.Conv2d(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        groups=groups,
+        bias=False)
+    # _fuse_bn_tensor reads these children back as .conv and .bn.
+    cell = nn.Sequential()
+    cell.add_module('conv', conv)
+    cell.add_module('bn', nn.BatchNorm2d(num_features=out_channels))
+    return cell
+
+
+class RepVggBlock(nn.Module):
+    '''RepVggBlock is a basic rep-style block, including training and deploy status
+    This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
+    '''
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 dilation=1,
+                 groups=1,
+                 padding_mode='zeros',
+                 deploy=False,
+                 use_se=False,
+                 act='relu',
+                 norm=None):
+        """ Initialization of the class.
+        Args:
+            in_channels (int): Number of channels in the input image
+            out_channels (int): Number of channels produced by the convolution
+            kernel_size (int): Size of the convolving kernel; must be 3
+            stride (int or tuple, optional): Stride of the convolution. Default: 1
+            padding (int): Input zero-padding; must be 1. Default: 1
+            dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+            groups (int, optional): Number of blocked connections. Default: 1
+            padding_mode (string, optional): Default: 'zeros'
+            deploy: Whether to be deploy status or training status. Default: False
+            use_se: Must be False; SE is not implemented. Default: False
+            act: Activation name, or an object used as-is. Default: 'relu'
+            norm: Accepted but not used by this block. Default: None
+        """
+        super(RepVggBlock, self).__init__()
+        self.deploy = deploy
+        self.groups = groups
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        assert kernel_size == 3
+        assert padding == 1
+
+        padding_11 = padding - kernel_size // 2
+
+        if isinstance(act, str):
+            self.nonlinearity = get_activation(act)
+        else:
+            self.nonlinearity = act
+
+        if use_se:
+            raise NotImplementedError('se block not supported yet')
+        else:
+            self.se = nn.Identity()
+
+        if deploy:
+            self.rbr_reparam = nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                dilation=dilation,
+                groups=groups,
+                bias=True,
+                padding_mode=padding_mode)
+
+        else:
+            self.rbr_identity = None
+            self.rbr_dense = conv_bn(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups)
+            self.rbr_1x1 = conv_bn(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                stride=stride,
+                padding=padding_11,
+                groups=groups)
+
+    def forward(self, inputs):
+        '''Forward process'''
+        if hasattr(self, 'rbr_reparam'):
+            return self.nonlinearity(self.se(self.rbr_reparam(inputs)))
+
+        if self.rbr_identity is None:
+            id_out = 0
+        else:
+            id_out = self.rbr_identity(inputs)
+
+        return self.nonlinearity(
+            self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out))
+
+    def get_equivalent_kernel_bias(self):
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
+        kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
+        return kernel3x3 + self._pad_1x1_to_3x3_tensor(
+            kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        if kernel1x1 is None:
+            return 0
+        else:
+            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch):
+        if branch is None:
+            return 0, 0
+        if isinstance(branch, nn.Sequential):
+            kernel = branch.conv.weight
+            running_mean = branch.bn.running_mean
+            running_var = branch.bn.running_var
+            gamma = branch.bn.weight
+            beta = branch.bn.bias
+            eps = branch.bn.eps
+        else:
+            assert isinstance(branch, nn.BatchNorm2d)
+            if not hasattr(self, 'id_tensor'):
+                input_dim = self.in_channels // self.groups
+                kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
+                                        dtype=np.float32)
+                for i in range(self.in_channels):
+                    kernel_value[i, i % input_dim, 1, 1] = 1
+                self.id_tensor = torch.from_numpy(kernel_value).to(
+                    branch.weight.device)
+            kernel = self.id_tensor
+            running_mean = branch.running_mean
+            running_var = branch.running_var
+            gamma = branch.weight
+            beta = branch.bias
+            eps = branch.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma / std
+
+    def switch_to_deploy(self):
+        if hasattr(self, 'rbr_reparam'):
+            return
+        kernel, bias = self.get_equivalent_kernel_bias()
+        self.rbr_reparam = nn.Conv2d(
+            in_channels=self.rbr_dense.conv.in_channels,
+            out_channels=self.rbr_dense.conv.out_channels,
+            kernel_size=self.rbr_dense.conv.kernel_size,
+            stride=self.rbr_dense.conv.stride,
+            padding=self.rbr_dense.conv.padding,
+            dilation=self.rbr_dense.conv.dilation,
+            groups=self.rbr_dense.conv.groups,
+            bias=True)
+        self.rbr_reparam.weight.data = kernel
+        self.rbr_reparam.bias.data = bias
+        for para in self.parameters():
+            para.detach_()
+        self.__delattr__('rbr_dense')
+        self.__delattr__('rbr_1x1')
+        if hasattr(self, 'rbr_identity'):
+            self.__delattr__('rbr_identity')
+        if hasattr(self, 'id_tensor'):
+            self.__delattr__('id_tensor')
+        self.deploy = True
diff --git a/modelscope/models/cv/tinynas_detection/core/utils.py b/modelscope/models/cv/tinynas_detection/core/utils.py
new file mode 100644
index 00000000..482f12fb
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/utils.py
@@ -0,0 +1,196 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import numpy as np
+import torch
+import torchvision
+
+__all__ = [
+    'filter_box',
+    'postprocess_airdet',
+    'bboxes_iou',
+    'matrix_iou',
+    'adjust_box_anns',
+    'xyxy2xywh',
+    'xyxy2cxcywh',
+]
+
+
+def multiclass_nms(multi_bboxes,
+                   multi_scores,
+                   score_thr,
+                   iou_thr,
+                   max_num=100,
+                   score_factors=None):
+    """NMS for multi-class bboxes.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class); every column is treated
+            as a class score (no background column is stripped here).
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        iou_thr (float): NMS IoU threshold
+        max_num (int): if there are more than max_num bboxes after NMS,
+            only top max_num will be kept.
+        score_factors (Tensor): The factors multiplied to scores before
+            applying NMS
+
+    Returns:
+        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k, )
+            and (k, ). Labels are 0-based.
+    """
+    num_classes = multi_scores.size(1)
+    # class-agnostic boxes are expanded so that each class has its own copy
+    if multi_bboxes.shape[1] > 4:
+        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
+    else:
+        bboxes = multi_bboxes[:, None].expand(
+            multi_scores.size(0), num_classes, 4)
+    scores = multi_scores
+    # filter out boxes with low scores
+    valid_mask = scores > score_thr  # 1000 * 80 bool
+
+    # We use masked_select for ONNX exporting purpose,
+    # which is equivalent to bboxes = bboxes[valid_mask]
+    # (TODO): as ONNX does not support repeat now,
+    # we have to use this ugly code
+    # bboxes -> 1000, 4
+    bboxes = torch.masked_select(
+        bboxes,
+        torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
+                    -1)).view(-1, 4)  # mask->  1000*80*4, 80000*4
+    if score_factors is not None:
+        scores = scores * score_factors[:, None]
+    scores = torch.masked_select(scores, valid_mask)
+    labels = valid_mask.nonzero(as_tuple=False)[:, 1]
+
+    if bboxes.numel() == 0:
+        bboxes = multi_bboxes.new_zeros((0, 4))  # same width as bboxes[keep]
+        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
+        scores = multi_bboxes.new_zeros((0, ))
+
+        return bboxes, scores, labels
+
+    keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)
+
+    if max_num > 0:
+        keep = keep[:max_num]
+
+    return bboxes[keep], scores[keep], labels[keep]
+
+
+def filter_box(output, scale_range):
+    """Keep rows of `output` whose box area lies strictly inside the range.
+
+    Columns 0-3 are read as x1, y1, x2, y2; `scale_range` is (min, max)
+    and the area bounds are min**2 and max**2.
+    """
+    lo, hi = scale_range
+    area = (output[:, 2] - output[:, 0]) * (output[:, 3] - output[:, 1])
+    return output[(area > lo * lo) & (area < hi * hi)]
+
+
+def filter_results(boxlist, num_classes, nms_thre):
+    """Apply class-wise NMS to `boxlist` and return the kept entries.
+
+    Uses `boxlist.bbox` and its 'scores' / 'labels' fields; `num_classes`
+    is accepted but not used.
+    """
+    keep = torchvision.ops.batched_nms(
+        boxlist.bbox,
+        boxlist.get_field('scores'),
+        boxlist.get_field('labels'),
+        nms_thre,
+    )
+    return boxlist[keep]
+
+
+def postprocess_airdet(prediction,
+                       num_classes,
+                       conf_thre=0.7,
+                       nms_thre=0.45,
+                       imgs=None):
+    """Convert (cx, cy, w, h, ...) predictions to per-image NMS detections.
+
+    `prediction` is (batch, n, 5 + #class); its first four columns are
+    rewritten in place to (x1, y1, x2, y2), column 4 is skipped and columns
+    5: are the class scores. Returns a list with, per image, None (no
+    candidates) or a (k, 7) tensor (x1, y1, x2, y2, score, score, label).
+    `num_classes` and `imgs` are unused.
+    """
+    half_wh = prediction[:, :, 2:4] / 2
+    centers = prediction[:, :, :2].clone()
+    prediction[:, :, :2] = centers - half_wh
+    prediction[:, :, 2:4] = centers + half_wh
+    output = [None for _ in range(len(prediction))]
+    for i, image_pred in enumerate(prediction):
+        if not image_pred.size(0):
+            continue
+        boxes, scores, labels = multiclass_nms(image_pred[:, :4],
+                                               image_pred[:, 5:], conf_thre,
+                                               nms_thre, 500)
+        # Slice to 4 box columns so an empty NMS result (which may carry a
+        # fifth column) still yields the same 7-column layout.
+        output[i] = torch.cat((boxes[:, :4], scores[:, None],
+                               scores[:, None], labels[:, None]), dim=1)
+    return output
+
+
+def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
+    """Return the (len(a), len(b)) pairwise IoU matrix of two box sets.
+
+    Boxes are (x1, y1, x2, y2) if `xyxy` else (cx, cy, w, h).
+    """
+    if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
+        raise IndexError
+    if xyxy:
+        tl_a, br_a = bboxes_a[:, :2], bboxes_a[:, 2:]
+        tl_b, br_b = bboxes_b[:, :2], bboxes_b[:, 2:]
+        area_a = torch.prod(br_a - tl_a, 1)
+        area_b = torch.prod(br_b - tl_b, 1)
+    else:
+        tl_a = bboxes_a[:, :2] - bboxes_a[:, 2:] / 2
+        br_a = bboxes_a[:, :2] + bboxes_a[:, 2:] / 2
+        tl_b = bboxes_b[:, :2] - bboxes_b[:, 2:] / 2
+        br_b = bboxes_b[:, :2] + bboxes_b[:, 2:] / 2
+        area_a = torch.prod(bboxes_a[:, 2:], 1)
+        area_b = torch.prod(bboxes_b[:, 2:], 1)
+    tl = torch.max(tl_a[:, None], tl_b)
+    br = torch.min(br_a[:, None], br_b)
+    en = (tl < br).type(tl.type()).prod(dim=2)
+    area_i = torch.prod(br - tl, 2) * en  # zero where boxes do not overlap
+    return area_i / (area_a[:, None] + area_b - area_i)
+
+
+def matrix_iou(a, b):
+    """Return the pairwise IoU of xyxy boxes `a` and `b` (numpy version).
+
+    Used for data augmentation; 1e-12 in the denominator avoids 0 / 0.
+    """
+    top_left = np.maximum(a[:, None, :2], b[:, :2])
+    bottom_right = np.minimum(a[:, None, 2:], b[:, 2:])
+    overlaps = (top_left < bottom_right).all(axis=2)
+    inter = np.prod(bottom_right - top_left, axis=2) * overlaps
+    areas = [np.prod(x[:, 2:] - x[:, :2], axis=1) for x in (a, b)]
+    return inter / (areas[0][:, None] + areas[1] - inter + 1e-12)
+
+
+def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):
+    bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max)
+    bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max)
+    return bbox  # same array as the input: both column sets are updated in place
+
+
+def xyxy2xywh(bboxes):
+    """Convert (x1, y1, x2, y2) boxes to (x1, y1, w, h) in place."""
+    bboxes[:, 2:4] = bboxes[:, 2:4] - bboxes[:, 0:2]
+    return bboxes
+
+
+def xyxy2cxcywh(bboxes):
+    """Convert (x1, y1, x2, y2) boxes to (cx, cy, w, h) in place."""
+    # width / height are written first because the centers are built on them
+    bboxes[:, 2:4] = bboxes[:, 2:4] - bboxes[:, 0:2]
+    bboxes[:, 0:2] = bboxes[:, 0:2] + bboxes[:, 2:4] * 0.5
+    return bboxes
diff --git a/modelscope/models/cv/tinynas_detection/detector.py b/modelscope/models/cv/tinynas_detection/detector.py
new file mode 100644
index 00000000..615b13a8
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/detector.py
@@ -0,0 +1,181 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import os.path as osp
+import pickle
+
+import cv2
+import torch
+import torchvision
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .backbone import build_backbone
+from .head import build_head
+from .neck import build_neck
+from .utils import parse_config
+
+
+class SingleStageDetector(TorchModel):
+    """
+    The base class of single stage detector.
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Build backbone, neck and head from `model_dir`/airdet_s.py, then
+        load the class map and the checkpoint named by that config.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+
+        config_path = osp.join(model_dir, 'airdet_s.py')
+        config = parse_config(config_path)
+        self.cfg = config
+        model_path = osp.join(model_dir, config.model.name)
+        label_map = osp.join(model_dir, config.model.class_map)
+        # NOTE: unpickling can execute code; model_dir must be trusted.
+        with open(label_map, 'rb') as f:  # do not leak the file handle
+            self.label_map = pickle.load(f)
+        self.size_divisible = config.dataset.size_divisibility
+        self.num_classes = config.model.head.num_classes
+        self.conf_thre = config.model.head.nms_conf_thre
+        self.nms_thre = config.model.head.nms_iou_thre
+
+        self.backbone = build_backbone(self.cfg.model.backbone)
+        self.neck = build_neck(self.cfg.model.neck)
+        self.head = build_head(self.cfg.model.head)
+
+        self.load_pretrain_model(model_path)
+
+    def load_pretrain_model(self, pretrain_model):
+        """Strictly load the 'model' entry of the checkpoint file."""
+        state_dict = torch.load(pretrain_model, map_location='cpu')['model']
+        # every 'module.' substring is removed from the parameter names
+        new_state_dict = {
+            k.replace('module.', ''): v
+            for k, v in state_dict.items()
+        }
+        self.load_state_dict(new_state_dict, strict=True)
+
+    def inference(self, x):
+        """Dispatch to forward_train or forward_eval on `self.training`."""
+        if self.training:
+            return self.forward_train(x)
+        return self.forward_eval(x)
+
+    def forward_train(self, x):
+        """Placeholder: training is not implemented, returns None."""
+        pass
+
+    def forward_eval(self, x):
+        """Run backbone, neck and head in turn; return the head output."""
+        x = self.backbone(x)
+        x = self.neck(x)
+        return self.head(x)
+
+    def preprocess(self, image):
+        """Pad an (h, w, c) numpy image into a (1, c, H, W) float tensor."""
+        image = torch.from_numpy(image).type(torch.float32)
+        image = image.permute(2, 0, 1)
+        shape = image.shape  # c, h, w
+        if self.size_divisible > 0:
+            import math
+            stride = self.size_divisible
+            shape = list(shape)
+            shape[1] = int(math.ceil(shape[1] / stride) * stride)
+            shape[2] = int(math.ceil(shape[2] / stride) * stride)
+            shape = tuple(shape)
+        pad_img = image.new(*shape).zero_()
+        pad_img[:, :image.shape[1], :image.shape[2]].copy_(image)
+        return pad_img.unsqueeze(0)
+
+    def postprocess(self, preds):
+        """Run NMS on `preds`; return numpy boxes, scores and class names."""
+        bboxes, scores, labels_idx = postprocess_gfocal(
+            preds, self.num_classes, self.conf_thre, self.nms_thre)
+        bboxes = bboxes.cpu().numpy()
+        scores = scores.cpu().numpy()
+        labels_idx = labels_idx.cpu().numpy()
+        labels = [self.label_map[idx + 1][0]['name'] for idx in labels_idx]
+
+        return (bboxes, scores, labels)
+
+
+def multiclass_nms(multi_bboxes,
+                   multi_scores,
+                   score_thr,
+                   iou_thr,
+                   max_num=100,
+                   score_factors=None):
+    """NMS for multi-class bboxes.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class); every column is treated
+            as a class score (no background column is stripped here).
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        iou_thr (float): NMS IoU threshold
+        max_num (int): if there are more than max_num bboxes after NMS,
+            only top max_num will be kept.
+        score_factors (Tensor): The factors multiplied to scores before
+            applying NMS
+
+    Returns:
+        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k, )
+            and (k, ). Labels are 0-based.
+    """
+    num_classes = multi_scores.size(1)
+    # class-agnostic boxes are expanded so that each class has its own copy
+    if multi_bboxes.shape[1] > 4:
+        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
+    else:
+        bboxes = multi_bboxes[:, None].expand(
+            multi_scores.size(0), num_classes, 4)
+    scores = multi_scores
+    # filter out boxes with low scores
+    valid_mask = scores > score_thr  # 1000 * 80 bool
+
+    # We use masked_select for ONNX exporting purpose,
+    # which is equivalent to bboxes = bboxes[valid_mask]
+    # (TODO): as ONNX does not support repeat now,
+    # we have to use this ugly code
+    # bboxes -> 1000, 4
+    bboxes = torch.masked_select(
+        bboxes,
+        torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
+                    -1)).view(-1, 4)  # mask->  1000*80*4, 80000*4
+    if score_factors is not None:
+        scores = scores * score_factors[:, None]
+    scores = torch.masked_select(scores, valid_mask)
+    labels = valid_mask.nonzero(as_tuple=False)[:, 1]
+
+    if bboxes.numel() == 0:
+        bboxes = multi_bboxes.new_zeros((0, 4))  # same width as bboxes[keep]
+        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
+        scores = multi_bboxes.new_zeros((0, ))
+
+        return bboxes, scores, labels
+
+    keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)
+
+    if max_num > 0:
+        keep = keep[:max_num]
+
+    return bboxes[keep], scores[keep], labels[keep]
+
+
+def postprocess_gfocal(prediction, num_classes, conf_thre=0.05, nms_thre=0.7):
+    """Run multi-class NMS on a single-image batch (1, n, 4 + #class).
+
+    Returns (bboxes, scores, labels), all empty when n == 0.
+    """
+    assert prediction.shape[0] == 1
+    image_pred = prediction[0]
+    if not image_pred.size(0):
+        empty = image_pred.new_zeros((0, ))
+        return image_pred.new_zeros((0, 4)), empty, empty.long()
+    return multiclass_nms(image_pred[:, :4], image_pred[:, 4:], conf_thre,
+                          nms_thre, 500)
diff --git a/modelscope/models/cv/tinynas_detection/head/__init__.py b/modelscope/models/cv/tinynas_detection/head/__init__.py
new file mode 100644
index 00000000..f870fae1
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/head/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import copy
+
+from .gfocal_v2_tiny import GFocalHead_Tiny
+
+
+def build_head(cfg):
+    """Build the head named by `cfg['name']`; the rest of cfg is its kwargs."""
+    head_cfg = copy.deepcopy(cfg)  # keep the caller's cfg intact
+    name = head_cfg.pop('name')
+    # Guard clause: 'GFocalV2' is the only name handled here.
+    if name != 'GFocalV2':
+        raise NotImplementedError
+    return GFocalHead_Tiny(**head_cfg)
diff --git a/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py
new file mode 100644
index 00000000..41f35968
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py
@@ -0,0 +1,361 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import functools
+from functools import partial
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..core.base_ops import BaseConv, DWConv
+
+
+class Scale(nn.Module):
+    """Multiply the input by a single learnable scalar (initially `scale`)."""
+    def __init__(self, scale=1.0):
+        super().__init__()
+        self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))
+
+    def forward(self, x):
+        return x * self.scale
+
+
+def multi_apply(func, *args, **kwargs):
+    """Map `func` over zipped `args`; return a tuple of per-output lists."""
+    bound = partial(func, **kwargs) if kwargs else func
+    columns = zip(*map(bound, *args))
+    return tuple(list(col) for col in columns)
+
+
+def xyxy2CxCywh(xyxy, size=None):
+    """Convert (x1, y1, x2, y2) boxes to (cx, cy, w, h) along the last dim.
+
+    When `size` is given, w is clamped to [0, size[1]] and h to
+    [0, size[0]]; the centers are left untouched.
+    """
+    x1, y1, x2, y2 = (xyxy[..., k] for k in range(4))
+    cx = (x1 + x2) / 2
+    cy = (y1 + y2) / 2
+    w = x2 - x1
+    h = y2 - y1
+    if size is not None:
+        w = w.clamp(min=0, max=size[1])
+        h = h.clamp(min=0, max=size[0])
+    return torch.stack([cx, cy, w, h], axis=-1)
+
+
+def distance2bbox(points, distance, max_shape=None):
+    """Decode (left, top, right, bottom) distances into x1, y1, x2, y2.
+
+    x coordinates are clamped to [0, max_shape[1]] and y coordinates to
+    [0, max_shape[0]] when `max_shape` is given.
+    """
+    px, py = points[..., 0], points[..., 1]
+    corners = [px - distance[..., 0], py - distance[..., 1],
+               px + distance[..., 2], py + distance[..., 3]]
+    if max_shape is not None:
+        limits = (max_shape[1], max_shape[0]) * 2
+        corners = [c.clamp(min=0, max=m) for c, m in zip(corners, limits)]
+    return torch.stack(corners, -1)
+
+
+def bbox2distance(points, bbox, max_dis=None, eps=0.1):
+    """Encode boxes as (left, top, right, bottom) distances from points.
+
+    With `max_dis` set, every distance is clamped to [0, max_dis - eps].
+    """
+    px, py = points[:, 0], points[:, 1]
+    sides = [px - bbox[:, 0], py - bbox[:, 1],
+             bbox[:, 2] - px, bbox[:, 3] - py]
+    if max_dis is not None:
+        # eps keeps the upper bound just below max_dis
+        upper = max_dis - eps
+        sides = [s.clamp(min=0, max=upper) for s in sides]
+    return torch.stack(sides, -1)
+
+
+class Integral(nn.Module):
+    """Fixed layer turning per-side bin logits into expected offsets.
+    """
+
+    def __init__(self, reg_max=16):
+        super().__init__()
+        self.reg_max = reg_max
+        bins = torch.linspace(0, self.reg_max, self.reg_max + 1)
+        self.register_buffer('project', bins)
+
+    def forward(self, x):
+        """Softmax each of the 4 groups of `reg_max + 1` logits in the last
+        dim of `x` (batch, n, 4 * (reg_max + 1)) and return the expectation
+        over bins 0..reg_max, shaped (batch, n, 4).
+        """
+        n_bins = self.reg_max + 1
+        prob = F.softmax(x.reshape(*x.size()[:-1], 4, n_bins), dim=-1)
+        b, nb, ne, _ = prob.size()
+        weights = self.project.type_as(prob).unsqueeze(1)
+        expect = torch.matmul(prob.reshape(b * nb * ne, n_bins), weights)
+        return expect.reshape(b, nb, 4)
+
+
+class GFocalHead_Tiny(nn.Module):
+    """Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality
+    Estimation for Dense Object Detection.
+    """
+
+    def __init__(
+            self,
+            num_classes,
+            in_channels,
+            stacked_convs=4,  # 4
+            feat_channels=256,
+            reg_max=12,
+            reg_topk=4,
+            reg_channels=64,
+            strides=[8, 16, 32],
+            add_mean=True,
+            norm='gn',
+            act='relu',
+            start_kernel_size=3,
+            conv_groups=1,
+            conv_type='BaseConv',
+            simOTA_cls_weight=1.0,
+            simOTA_iou_weight=3.0,
+            octbase=8,
+            simlqe=False,
+            **kwargs):
+        """`in_channels` needs one entry per stride; `feat_channels` may be
+        an int shared by all levels or a list. Extra kwargs are ignored.
+        """
+        super(GFocalHead_Tiny, self).__init__()
+        self.simlqe = simlqe
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.strides = list(strides)  # never alias the shared default list
+        self.feat_channels = feat_channels if isinstance(feat_channels, list) \
+            else [feat_channels] * len(self.strides)
+
+        # +1 channel keeps consistency with former models (to be deprecated)
+        self.cls_out_channels = num_classes + 1
+        self.stacked_convs = stacked_convs
+        self.conv_groups = conv_groups
+        self.reg_max = reg_max
+        self.reg_topk = reg_topk
+        self.reg_channels = reg_channels
+        self.add_mean = add_mean
+        # per-side statistics fed to reg_conf: top-k probs (+ their mean)
+        self.total_dim = reg_topk + 1 if add_mean else reg_topk
+        self.start_kernel_size = start_kernel_size
+
+        self.norm = norm
+        self.act = act
+        self.conv_module = DWConv if conv_type == 'DWConv' else BaseConv
+
+        self.integral = Integral(self.reg_max)
+        self._init_layers()
+
+    def _build_not_shared_convs(self, in_channel, feat_channels):
+        """Build one level's cls / reg conv towers and its quality branch."""
+        self.relu = nn.ReLU(inplace=True)
+        cls_convs = nn.ModuleList()
+        reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = feat_channels if i > 0 else in_channel
+            kernel_size = 3 if i > 0 else self.start_kernel_size
+            cls_convs.append(
+                self.conv_module(
+                    chn,
+                    feat_channels,
+                    kernel_size,
+                    stride=1,
+                    groups=self.conv_groups,
+                    norm=self.norm,
+                    act=self.act))
+            reg_convs.append(
+                self.conv_module(
+                    chn,
+                    feat_channels,
+                    kernel_size,
+                    stride=1,
+                    groups=self.conv_groups,
+                    norm=self.norm,
+                    act=self.act))
+        if not self.simlqe:
+            conf_vector = [nn.Conv2d(4 * self.total_dim, self.reg_channels, 1)]
+        else:
+            conf_vector = [
+                nn.Conv2d(4 * (self.reg_max + 1), self.reg_channels, 1)
+            ]
+        conf_vector += [self.relu]
+        conf_vector += [nn.Conv2d(self.reg_channels, 1, 1), nn.Sigmoid()]
+        reg_conf = nn.Sequential(*conf_vector)
+
+        return cls_convs, reg_convs, reg_conf
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        self.reg_confs = nn.ModuleList()
+
+        for i in range(len(self.strides)):
+            cls_convs, reg_convs, reg_conf = self._build_not_shared_convs(
+                self.in_channels[i], self.feat_channels[i])
+            self.cls_convs.append(cls_convs)
+            self.reg_convs.append(reg_convs)
+            self.reg_confs.append(reg_conf)
+
+        self.gfl_cls = nn.ModuleList([
+            nn.Conv2d(
+                self.feat_channels[i], self.cls_out_channels, 3, padding=1)
+            for i in range(len(self.strides))
+        ])
+
+        self.gfl_reg = nn.ModuleList([
+            nn.Conv2d(
+                self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1)
+            for i in range(len(self.strides))
+        ])
+
+        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
+
+    def forward(self,
+                xin,
+                labels=None,
+                imgs=None,
+                conf_thre=0.05,
+                nms_thre=0.7):
+        """Return `self.loss(...)` when training, else `get_bboxes(...)`."""
+        # prepare labels during training
+        b, c, h, w = xin[0].shape
+        if labels is not None:
+            gt_bbox_list = []
+            gt_cls_list = []
+            for label in labels:
+                gt_bbox_list.append(label.bbox)
+                gt_cls_list.append((label.get_field('labels')
+                                    - 1).long())  # labels starts from 1
+
+        # prepare priors for label assignment and bbox decode
+        mlvl_priors_list = [
+            self.get_single_level_center_priors(
+                xin[i].shape[0],
+                xin[i].shape[-2:],
+                stride,
+                dtype=torch.float32,
+                device=xin[0].device) for i, stride in enumerate(self.strides)
+        ]
+        mlvl_priors = torch.cat(mlvl_priors_list, dim=1)
+
+        # forward for bboxes and classification prediction
+        cls_scores, bbox_preds = multi_apply(
+            self.forward_single,
+            xin,
+            self.cls_convs,
+            self.reg_convs,
+            self.gfl_cls,
+            self.gfl_reg,
+            self.reg_confs,
+            self.scales,
+        )
+        flatten_cls_scores = torch.cat(cls_scores, dim=1)
+        flatten_bbox_preds = torch.cat(bbox_preds, dim=1)
+
+        # calculating losses or bboxes decoded
+        if self.training:
+            loss = self.loss(flatten_cls_scores, flatten_bbox_preds,
+                             gt_bbox_list, gt_cls_list, mlvl_priors)
+            return loss
+        else:
+            output = self.get_bboxes(flatten_cls_scores, flatten_bbox_preds,
+                                     mlvl_priors)
+            return output
+
+    def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg,
+                       reg_conf, scale):
+        """Forward one scale level; return (N, H*W, C) class scores
+        (sigmoid times quality score) and raw bbox distribution logits.
+        """
+        cls_feat = x
+        reg_feat = x
+
+        for cls_conv in cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in reg_convs:
+            reg_feat = reg_conv(reg_feat)
+
+        bbox_pred = scale(gfl_reg(reg_feat)).float()
+        N, C, H, W = bbox_pred.size()
+        prob = F.softmax(
+            bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2)
+        if not self.simlqe:
+            prob_topk, _ = prob.topk(self.reg_topk, dim=2)
+
+            if self.add_mean:
+                stat = torch.cat(
+                    [prob_topk, prob_topk.mean(dim=2, keepdim=True)], dim=2)
+            else:
+                stat = prob_topk
+
+            quality_score = reg_conf(stat.reshape(N, 4 * self.total_dim, H, W))
+        else:
+            quality_score = reg_conf(
+                bbox_pred.reshape(N, 4 * (self.reg_max + 1), H, W))
+
+        cls_score = gfl_cls(cls_feat).sigmoid() * quality_score
+
+        flatten_cls_score = cls_score.flatten(start_dim=2).transpose(1, 2)
+        flatten_bbox_pred = bbox_pred.flatten(start_dim=2).transpose(1, 2)
+        return flatten_cls_score, flatten_bbox_pred
+
+    def get_single_level_center_priors(self, batch_size, featmap_size, stride,
+                                       dtype, device):
+        """Return (batch, h*w, 4) priors: (x, y, stride, stride) per cell."""
+        h, w = featmap_size
+        x_range = (torch.arange(0, int(w), dtype=dtype,
+                                device=device)) * stride
+        y_range = (torch.arange(0, int(h), dtype=dtype,
+                                device=device)) * stride
+
+        x = x_range.repeat(h, 1)
+        y = y_range.unsqueeze(-1).repeat(1, w)
+
+        y = y.flatten()
+        x = x.flatten()
+        strides = x.new_full((x.shape[0], ), stride)
+        priors = torch.stack([x, y, strides, strides], dim=-1)
+
+        return priors.unsqueeze(0).repeat(batch_size, 1, 1)
+
+    def sample(self, assign_result, gt_bboxes):
+        """Return pos / neg prior indices and the gt boxes of the positives."""
+        pos_inds = torch.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+        neg_inds = torch.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+        pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+        if gt_bboxes.numel() == 0:
+            # hack for index error case
+            assert pos_assigned_gt_inds.numel() == 0
+            pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4)
+        else:
+            if len(gt_bboxes.shape) < 2:
+                gt_bboxes = gt_bboxes.view(-1, 4)
+            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
+
+        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds
+
+    def get_bboxes(self,
+                   cls_preds,
+                   reg_preds,
+                   mlvl_center_priors,
+                   img_meta=None):
+        """Decode boxes from priors and append the first num_classes scores."""
+        dis_preds = self.integral(reg_preds) * mlvl_center_priors[..., 2, None]
+        bboxes = distance2bbox(mlvl_center_priors[..., :2], dis_preds)
+
+        res = torch.cat([bboxes, cls_preds[..., 0:self.num_classes]], dim=-1)
+
+        return res
diff --git a/modelscope/models/cv/tinynas_detection/neck/__init__.py b/modelscope/models/cv/tinynas_detection/neck/__init__.py
new file mode 100644
index 00000000..3c418c29
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/neck/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import copy
+
+from .giraffe_fpn import GiraffeNeck
+from .giraffe_fpn_v2 import GiraffeNeckV2
+
+
+def build_neck(cfg):
+    """Build the neck named by `cfg['name']`; unknown names raise."""
+    neck_cfg = copy.deepcopy(cfg)
+    necks = {'GiraffeNeck': GiraffeNeck, 'GiraffeNeckV2': GiraffeNeckV2}
+    if neck_cfg['name'] not in necks:
+        raise NotImplementedError
+    return necks[neck_cfg.pop('name')](**neck_cfg)
diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py
new file mode 100644
index 00000000..289fdfd2
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py
@@ -0,0 +1,235 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import collections
+import itertools
+import os
+
+import networkx as nx
+from omegaconf import OmegaConf
+
+Node = collections.namedtuple('Node', ['id', 'inputs', 'type'])
+
+
+def get_graph_info(graph):
+    """Return (Nodes, input_nodes, output_nodes) for `graph`: a node below
+    all its neighbors is an input (type 0), above all an output (type 1).
+    """
+    input_nodes, output_nodes, Nodes = [], [], []
+    for node in range(graph.number_of_nodes()):
+        neighbors = sorted(graph.neighbors(node))
+        kind = -1
+        if node < neighbors[0]:
+            input_nodes.append(node)
+            kind = 0
+        if node > neighbors[-1]:
+            output_nodes.append(node)
+            kind = 1
+        Nodes.append(Node(node, [n for n in neighbors if n < node], kind))
+    return Nodes, input_nodes, output_nodes
+
+
+def nodeid_trans(id, cur_level, num_levels):
+    """Return `cur_level` plus a gap derived from `id` and `num_levels`.
+
+    Even ids get an extra offset of `(num_levels - cur_level) * 2 - 1`.
+    """
+    gap = ((id + 1) // 2) * num_levels * 2
+    extra = 0 if id % 2 == 1 else (num_levels - cur_level) * 2 - 1
+    return cur_level + int(gap + extra)
+
+
+def gen_log2n_graph_file(log2n_graph_file, depth_multiplier):
+    """Write the log2n skip edges as 'src,dst' lines to the given path."""
+    with open(log2n_graph_file, 'w') as f:  # closed even if a write fails
+        for i in range(depth_multiplier):
+            for j in [1, 2, 4, 8, 16, 32]:
+                if i - j < 0:
+                    # offsets only grow, so no later j can fit either
+                    break
+                f.write('%d,%d\n' % (i - j, i))
+
+
+def get_log2n_graph(depth_multiplier):
+    """Return (nodes, edges) of the log2n skip graph.
+
+    Node i is linked from i - j for every j in 1, 2, 4, ..., 32 with
+    i - j >= 0; edges are (source, target) tuples.
+    """
+    nodes = list(range(depth_multiplier))
+    connections = [(i - j, i)
+                   for i in nodes
+                   for j in (1, 2, 4, 8, 16, 32)
+                   if i - j >= 0]
+    return nodes, connections
+
+
+def get_dense_graph(depth_multiplier):
+    """Return (nodes, edges) of the dense skip graph.
+
+    Every node i in range(depth_multiplier) is linked from each j < i;
+    edges are (j, i) tuples ordered by i, then j.
+    """
+    nodes = list(range(depth_multiplier))
+    connections = [(j, i) for i in nodes for j in range(i)]
+    return nodes, connections
+
+
+def giraffeneck_config(min_level,
+                       max_level,
+                       weight_method=None,
+                       depth_multiplier=5,
+                       with_backslash=False,
+                       with_slash=False,
+                       with_skip_connect=False,
+                       skip_connect_type='dense'):
+    """Graph config with log2n merge and panet
+
+    Emits a node entry for every (level, graph node) pair, walking the
+    levels from max_level down to min_level, plus one output node per level.
+
+    Args:
+        min_level: lowest pyramid level, inclusive.
+        max_level: highest pyramid level, inclusive.
+        weight_method: stored on every node; falls back to 'fastattn'.
+        depth_multiplier: size passed to the skip-connection graph builder.
+        with_backslash: also wire each node to its backslash neighbour.
+        with_slash: also wire each node to its slash neighbour.
+        with_skip_connect: also wire each node to its graph predecessors.
+        skip_connect_type: graph builder to use, 'dense' or 'log2n'.
+
+    Returns:
+        dict sorted by node id; each value holds 'reduction' (1 << level),
+        'inputs_offsets' (ids fused into the node), 'weight_method' and
+        'is_out' (1 for the per-level output nodes, else 0).
+
+    Raises:
+        ValueError: if ``skip_connect_type`` is not a known graph type.
+    """
+    if skip_connect_type == 'dense':
+        nodes, connections = get_dense_graph(depth_multiplier)
+    elif skip_connect_type == 'log2n':
+        nodes, connections = get_log2n_graph(depth_multiplier)
+    else:
+        # Fail early; this used to surface later as an unbound ``nodes``.
+        raise ValueError(
+            'unknown skip_connect_type {}'.format(skip_connect_type))
+    graph = nx.Graph()
+    graph.add_nodes_from(nodes)
+    graph.add_edges_from(connections)
+
+    # Never populated in this function, so the drop handling below is inert.
+    drop_node = []
+    nodes, input_nodes, output_nodes = get_graph_info(graph)
+
+    weight_method = weight_method or 'fastattn'
+
+    num_levels = max_level - min_level + 1
+    node_ids = {min_level + i: [i] for i in range(num_levels)}
+    node_ids_per_layer = {}
+
+    pnodes = {}
+
+    def update_drop_node(new_id, input_offsets):
+        # Step back past dropped nodes, inheriting their live inputs.
+        while new_id in drop_node:
+            if new_id in pnodes:
+                for n in pnodes[new_id]['inputs_offsets']:
+                    if n not in input_offsets and n not in drop_node:
+                        input_offsets.append(n)
+            new_id = new_id - 1
+        if new_id not in input_offsets:
+            input_offsets.append(new_id)
+
+    def cal_diagonal_node(node_id, first_parity):
+        # Shared body of the backslash (first_parity=0) and slash
+        # (first_parity=1) lookups; -1 means there is no such neighbour.
+        ind, mod = divmod(node_id, num_levels)
+        if ind % 2 == first_parity:
+            if mod == (num_levels - 1):
+                return -1
+            return (ind - 1) * num_levels + (num_levels - 1 - mod - 1)
+        if mod == 0:
+            return -1
+        return (ind - 1) * num_levels + (num_levels - 1 - mod + 1)
+
+    # top-down layer
+    for i in range(max_level, min_level - 1, -1):
+        node_ids_per_layer[i] = []
+        for idx, node in enumerate(nodes):
+            input_offsets = []
+            if idx in input_nodes:
+                input_offsets.append(node_ids[i][0])
+            else:
+                if with_skip_connect:
+                    for input_id in node.inputs:
+                        new_id = nodeid_trans(input_id, i - min_level,
+                                              num_levels)
+                        update_drop_node(new_id, input_offsets)
+
+            # add top2down
+            new_id = nodeid_trans(idx, i - min_level, num_levels)
+
+            # add last node
+            last = new_id - 1
+            update_drop_node(last, input_offsets)
+
+            if with_backslash:
+                backslash = cal_diagonal_node(new_id, 0)
+                if backslash != -1 and backslash not in input_offsets:
+                    input_offsets.append(backslash)
+
+            if with_slash:
+                slash = cal_diagonal_node(new_id, 1)
+                if slash != -1 and slash not in input_offsets:
+                    input_offsets.append(slash)
+
+            if new_id in drop_node:
+                input_offsets = []
+
+            pnodes[new_id] = {
+                'reduction': 1 << i,
+                'inputs_offsets': input_offsets,
+                'weight_method': weight_method,
+                'is_out': 0,
+            }
+
+        # one output node per level, fed by the graph's output nodes
+        input_offsets = []
+        for out_id in output_nodes:
+            new_id = nodeid_trans(out_id, i - min_level, num_levels)
+            input_offsets.append(new_id)
+
+        pnodes[node_ids[i][0] + num_levels * (len(nodes) + 1)] = {
+            'reduction': 1 << i,
+            'inputs_offsets': input_offsets,
+            'weight_method': weight_method,
+            'is_out': 1,
+        }
+
+    pnodes = dict(sorted(pnodes.items(), key=lambda x: x[0]))
+    return pnodes
+
+
+def get_graph_config(fpn_name,
+                     min_level=3,
+                     max_level=7,
+                     weight_method='concat',
+                     depth_multiplier=5,
+                     with_backslash=False,
+                     with_slash=False,
+                     with_skip_connect=False,
+                     skip_connect_type='dense'):
+    """Return the node config for ``fpn_name`` (KeyError if unknown)."""
+    # The only registered config; it is built before the name lookup.
+    giraffeneck = giraffeneck_config(
+        min_level=min_level,
+        max_level=max_level,
+        weight_method=weight_method,
+        depth_multiplier=depth_multiplier,
+        with_backslash=with_backslash,
+        with_slash=with_slash,
+        with_skip_connect=with_skip_connect,
+        skip_connect_type=skip_connect_type)
+    registry = {'giraffeneck': giraffeneck}
+    return registry[fpn_name]
diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py
new file mode 100644
index 00000000..b7087779
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py
@@ -0,0 +1,661 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import logging
+import math
+from collections import OrderedDict
+from functools import partial
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm import create_model
+from timm.models.layers import (Swish, create_conv2d, create_pool2d,
+                                get_act_layer)
+
+from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer
+from .giraffe_config import get_graph_config
+
+_ACT_LAYER = Swish
+
+
+class SequentialList(nn.Sequential):
+    """Sequential container typed ``List[Tensor] -> List[Tensor]``.
+
+    Works around torchscript typing issues (list -> list).
+    """
+
+    def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
+        for stage in self:
+            x = stage(x)
+        return x
+
+
+class ConvBnAct2d(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 padding='',
+                 bias=False,
+                 norm_layer=nn.BatchNorm2d,
+                 act_layer=_ACT_LAYER):
+        super(ConvBnAct2d, self).__init__()
+
+        self.conv = create_conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding,
+            bias=bias)
+        self.bn = None if norm_layer is None else norm_layer(out_channels)
+        self.act = None if act_layer is None else act_layer(inplace=True)
+
+    def forward(self, x):
+        """Run the conv, then the norm and activation layers if present."""
+        out = self.conv(x)
+        if self.bn is not None:
+            out = self.bn(out)
+        # the activation is skipped when it was disabled at construction
+        return out if self.act is None else self.act(out)
+
+
+class SeparableConv2d(nn.Module):
+    """ Separable Conv
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 dilation=1,
+                 padding='',
+                 bias=False,
+                 channel_multiplier=1.0,
+                 pw_kernel_size=1,
+                 norm_layer=nn.BatchNorm2d,
+                 act_layer=_ACT_LAYER):
+        super(SeparableConv2d, self).__init__()
+        self.conv_dw = create_conv2d(
+            in_channels,
+            int(in_channels * channel_multiplier),
+            kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding,
+            depthwise=True)
+
+        self.conv_pw = create_conv2d(
+            int(in_channels * channel_multiplier),
+            out_channels,
+            pw_kernel_size,
+            padding=padding,
+            bias=bias)
+
+        self.bn = None if norm_layer is None else norm_layer(out_channels)
+        self.act = None if act_layer is None else act_layer(inplace=True)
+
+    def forward(self, x):
+        """Depthwise conv, pointwise conv, then optional norm and act."""
+        out = self.conv_pw(self.conv_dw(x))
+        if self.bn is not None:
+            out = self.bn(out)
+        if self.act is not None:
+            out = self.act(out)
+        return out
+
+
+def _init_weight(
+    m,
+    n='',
+):
+    """ Weight initialization as per Tensorflow official implementations.
+    """
+
+    def _fan_in_out(w, groups=1):
+        dimensions = w.dim()
+        if dimensions < 2:
+            raise ValueError(
+                'Fan in and fan out can not be computed for tensor with fewer than 2 dimensions'
+            )
+        num_input_fmaps = w.size(1)
+        num_output_fmaps = w.size(0)
+        receptive_field_size = 1
+        if w.dim() > 2:
+            receptive_field_size = w[0][0].numel()
+        fan_in = num_input_fmaps * receptive_field_size
+        fan_out = num_output_fmaps * receptive_field_size
+        fan_out //= groups
+        return fan_in, fan_out
+
+    def _glorot_uniform(w, gain=1, groups=1):
+        fan_in, fan_out = _fan_in_out(w, groups)
+        gain /= max(1., (fan_in + fan_out) / 2.)  # fan avg
+        limit = math.sqrt(3.0 * gain)
+        w.data.uniform_(-limit, limit)
+
+    def _variance_scaling(w, gain=1, groups=1):
+        fan_in, fan_out = _fan_in_out(w, groups)
+        gain /= max(1., fan_in)  # fan in
+        std = math.sqrt(gain)
+        w.data.normal_(std=std)
+
+    if isinstance(m, SeparableConv2d):
+        if 'box_net' in n or 'class_net' in n:
+            _variance_scaling(m.conv_dw.weight, groups=m.conv_dw.groups)
+            _variance_scaling(m.conv_pw.weight)
+            if m.conv_pw.bias is not None:
+                if 'class_net.predict' in n:
+                    m.conv_pw.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
+                else:
+                    m.conv_pw.bias.data.zero_()
+        else:
+            _glorot_uniform(m.conv_dw.weight, groups=m.conv_dw.groups)
+            _glorot_uniform(m.conv_pw.weight)
+            if m.conv_pw.bias is not None:
+                m.conv_pw.bias.data.zero_()
+    elif isinstance(m, ConvBnAct2d):
+        if 'box_net' in n or 'class_net' in n:
+            m.conv.weight.data.normal_(std=.01)
+            if m.conv.bias is not None:
+                if 'class_net.predict' in n:
+                    m.conv.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
+                else:
+                    m.conv.bias.data.zero_()
+        else:
+            _glorot_uniform(m.conv.weight)
+            if m.conv.bias is not None:
+                m.conv.bias.data.zero_()
+    elif isinstance(m, nn.BatchNorm2d):
+        m.weight.data.fill_(1.0)
+        m.bias.data.zero_()
+
+
+def _init_weight_alt(
+    m,
+    n='',
+):
+    """ Weight initialization alternative, based on EfficientNet backbone init w/ class bias addition
+    NOTE: this will likely be removed after some experimentation
+    """
+    if isinstance(m, nn.Conv2d):
+        fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        fan_out //= m.groups
+        m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+        if m.bias is not None:
+            if 'class_net.predict' in n:
+                m.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
+            else:
+                m.bias.data.zero_()
+    elif isinstance(m, nn.BatchNorm2d):
+        m.weight.data.fill_(1.0)
+        m.bias.data.zero_()
+
+
+class Interpolate2d(nn.Module):
+    r"""Resamples a 2d Image
+
+    The input data is assumed to be of the form
+    `minibatch x channels x [optional depth] x [optional height] x width`.
+    Hence, for spatial inputs, we expect a 4D Tensor and for volumetric inputs, we expect a 5D Tensor.
+
+    The algorithms available for upsampling are nearest neighbor and linear,
+    bilinear, bicubic and trilinear for 3D, 4D and 5D input Tensor,
+    respectively.
+
+    One can either give a :attr:`scale_factor` or the target output :attr:`size` to
+    calculate the output size. (You cannot give both, as it is ambiguous)
+
+    Args:
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional):
+            output spatial sizes
+        scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional):
+            multiplier for spatial size. Has to match input size if it is a tuple.
+        mode (str, optional): the upsampling algorithm: one of ``'nearest'``,
+            ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``.
+            Default: ``'nearest'``
+        align_corners (bool, optional): if ``True``, the corner pixels of the input
+            and output tensors are aligned, and thus preserving the values at
+            those pixels. This only has effect when :attr:`mode` is
+            ``'linear'``, ``'bilinear'``, or ``'trilinear'``. Default: ``False``
+    """
+    __constants__ = ['size', 'scale_factor', 'mode', 'align_corners', 'name']
+    name: str
+    size: Optional[Union[int, Tuple[int, int]]]
+    scale_factor: Optional[Union[float, Tuple[float, float]]]
+    mode: str
+    align_corners: Optional[bool]
+
+    def __init__(self,
+                 size: Optional[Union[int, Tuple[int, int]]] = None,
+                 scale_factor: Optional[Union[float, Tuple[float,
+                                                           float]]] = None,
+                 mode: str = 'nearest',
+                 align_corners: bool = False) -> None:
+        super(Interpolate2d, self).__init__()
+        self.name = type(self).__name__
+        self.size = size
+        if isinstance(scale_factor, tuple):
+            self.scale_factor = tuple(float(factor) for factor in scale_factor)
+        else:
+            self.scale_factor = float(scale_factor) if scale_factor else None
+        self.mode = mode
+        self.align_corners = None if mode == 'nearest' else align_corners
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.interpolate(
+            input,
+            self.size,
+            self.scale_factor,
+            self.mode,
+            self.align_corners,
+            recompute_scale_factor=False)
+
+
+class ResampleFeatureMap(nn.Sequential):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 reduction_ratio=1.,
+                 pad_type='',
+                 downsample=None,
+                 upsample=None,
+                 norm_layer=nn.BatchNorm2d,
+                 apply_bn=False,
+                 conv_after_downsample=False,
+                 redundant_bias=False):
+        super(ResampleFeatureMap, self).__init__()
+        downsample = downsample or 'max'
+        upsample = upsample or 'nearest'
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.reduction_ratio = reduction_ratio
+        self.conv_after_downsample = conv_after_downsample
+
+        conv = None
+        if in_channels != out_channels:
+            conv = ConvBnAct2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                padding=pad_type,
+                norm_layer=norm_layer if apply_bn else None,
+                bias=not apply_bn or redundant_bias,
+                act_layer=None)
+
+        if reduction_ratio > 1:
+            if conv is not None and not self.conv_after_downsample:
+                self.add_module('conv', conv)
+            if downsample in ('max', 'avg'):
+                stride_size = int(reduction_ratio)
+                downsample = create_pool2d(
+                    downsample,
+                    kernel_size=stride_size + 1,
+                    stride=stride_size,
+                    padding=pad_type)
+            else:
+                downsample = Interpolate2d(
+                    scale_factor=1. / reduction_ratio, mode=downsample)
+            self.add_module('downsample', downsample)
+            if conv is not None and self.conv_after_downsample:
+                self.add_module('conv', conv)
+        else:
+            if conv is not None:
+                self.add_module('conv', conv)
+            if reduction_ratio < 1:
+                scale = int(1 // reduction_ratio)
+                self.add_module(
+                    'upsample',
+                    Interpolate2d(scale_factor=scale, mode=upsample))
+
+
+class GiraffeCombine(nn.Module):
+
+    def __init__(self,
+                 feature_info,
+                 fpn_config,
+                 fpn_channels,
+                 inputs_offsets,
+                 target_reduction,
+                 pad_type='',
+                 downsample=None,
+                 upsample=None,
+                 norm_layer=nn.BatchNorm2d,
+                 apply_resample_bn=False,
+                 conv_after_downsample=False,
+                 redundant_bias=False,
+                 weight_method='attn'):
+        super(GiraffeCombine, self).__init__()
+        self.inputs_offsets = inputs_offsets
+        self.weight_method = weight_method
+
+        self.resample = nn.ModuleDict()
+        reduction_base = feature_info[0]['reduction']
+
+        target_channels_idx = int(
+            math.log(target_reduction // reduction_base, 2))
+        for idx, offset in enumerate(inputs_offsets):
+            if offset < len(feature_info):
+                in_channels = feature_info[offset]['num_chs']
+                input_reduction = feature_info[offset]['reduction']
+            else:
+                node_idx = offset
+                input_reduction = fpn_config[node_idx]['reduction']
+                # in_channels = fpn_config[node_idx]['num_chs']
+                input_channels_idx = int(
+                    math.log(input_reduction // reduction_base, 2))
+                in_channels = feature_info[input_channels_idx]['num_chs']
+
+            reduction_ratio = target_reduction / input_reduction
+            if weight_method == 'concat':
+                self.resample[str(offset)] = ResampleFeatureMap(
+                    in_channels,
+                    in_channels,
+                    reduction_ratio=reduction_ratio,
+                    pad_type=pad_type,
+                    downsample=downsample,
+                    upsample=upsample,
+                    norm_layer=norm_layer,
+                    apply_bn=apply_resample_bn,
+                    conv_after_downsample=conv_after_downsample,
+                    redundant_bias=redundant_bias)
+            else:
+                self.resample[str(offset)] = ResampleFeatureMap(
+                    in_channels,
+                    fpn_channels[target_channels_idx],
+                    reduction_ratio=reduction_ratio,
+                    pad_type=pad_type,
+                    downsample=downsample,
+                    upsample=upsample,
+                    norm_layer=norm_layer,
+                    apply_bn=apply_resample_bn,
+                    conv_after_downsample=conv_after_downsample,
+                    redundant_bias=redundant_bias)
+
+        if weight_method == 'attn' or weight_method == 'fastattn':
+            self.edge_weights = nn.Parameter(
+                torch.ones(len(inputs_offsets)), requires_grad=True)  # WSM
+        else:
+            self.edge_weights = None
+
+    def forward(self, x: List[torch.Tensor]):
+        dtype = x[0].dtype
+        nodes = []
+        if len(self.inputs_offsets) == 0:
+            return None
+        for offset, resample in zip(self.inputs_offsets,
+                                    self.resample.values()):
+            input_node = x[offset]
+            input_node = resample(input_node)
+            nodes.append(input_node)
+
+        if self.weight_method == 'attn':
+            normalized_weights = torch.softmax(
+                self.edge_weights.to(dtype=dtype), dim=0)
+            out = torch.stack(nodes, dim=-1) * normalized_weights
+            out = torch.sum(out, dim=-1)
+        elif self.weight_method == 'fastattn':
+            edge_weights = nn.functional.relu(
+                self.edge_weights.to(dtype=dtype))
+            weights_sum = torch.sum(edge_weights)
+            weights_norm = weights_sum + 0.0001
+            out = torch.stack([(nodes[i] * edge_weights[i]) / weights_norm
+                               for i in range(len(nodes))],
+                              dim=-1)
+
+            out = torch.sum(out, dim=-1)
+        elif self.weight_method == 'sum':
+            out = torch.stack(nodes, dim=-1)
+            out = torch.sum(out, dim=-1)
+        elif self.weight_method == 'concat':
+            out = torch.cat(nodes, dim=1)
+        else:
+            raise ValueError('unknown weight_method {}'.format(
+                self.weight_method))
+        return out
+
+
+class GiraffeNode(nn.Module):
+    """ A simple wrapper used in place of nn.Sequential for torchscript typing
+    Handles List[Tensor] -> Tensor, or None when combine returns None
+    """
+
+    def __init__(self, combine: nn.Module, after_combine: nn.Module):
+        super(GiraffeNode, self).__init__()
+        self.combine = combine
+        self.after_combine = after_combine
+
+    def forward(self, x: List[torch.Tensor]) -> Optional[torch.Tensor]:
+        combine_feat = self.combine(x)
+        if combine_feat is None:
+            return None
+        else:
+            return self.after_combine(combine_feat)
+
+
+class GiraffeLayer(nn.Module):
+
+    def __init__(self,
+                 feature_info,
+                 fpn_config,
+                 inner_fpn_channels,
+                 outer_fpn_channels,
+                 num_levels=5,
+                 pad_type='',
+                 downsample=None,
+                 upsample=None,
+                 norm_layer=nn.BatchNorm2d,
+                 act_layer=_ACT_LAYER,
+                 apply_resample_bn=False,
+                 conv_after_downsample=True,
+                 conv_bn_relu_pattern=False,
+                 separable_conv=True,
+                 redundant_bias=False,
+                 merge_type='conv'):
+        super(GiraffeLayer, self).__init__()
+        self.num_levels = num_levels
+        self.conv_bn_relu_pattern = False
+
+        self.feature_info = {}
+        for idx, feat in enumerate(feature_info):
+            self.feature_info[idx] = feat
+
+        self.fnode = nn.ModuleList()
+        reduction_base = feature_info[0]['reduction']
+        for i, fnode_cfg in fpn_config.items():
+            logging.debug('fnode {} : {}'.format(i, fnode_cfg))
+
+            if fnode_cfg['is_out'] == 1:
+                fpn_channels = outer_fpn_channels
+            else:
+                fpn_channels = inner_fpn_channels
+
+            reduction = fnode_cfg['reduction']
+            fpn_channels_idx = int(math.log(reduction // reduction_base, 2))
+            combine = GiraffeCombine(
+                self.feature_info,
+                fpn_config,
+                fpn_channels,
+                tuple(fnode_cfg['inputs_offsets']),
+                target_reduction=reduction,
+                pad_type=pad_type,
+                downsample=downsample,
+                upsample=upsample,
+                norm_layer=norm_layer,
+                apply_resample_bn=apply_resample_bn,
+                conv_after_downsample=conv_after_downsample,
+                redundant_bias=redundant_bias,
+                weight_method=fnode_cfg['weight_method'])
+
+            after_combine = nn.Sequential()
+
+            in_channels = 0
+            out_channels = 0
+            for input_offset in fnode_cfg['inputs_offsets']:
+                in_channels += self.feature_info[input_offset]['num_chs']
+
+            out_channels = fpn_channels[fpn_channels_idx]
+
+            if merge_type == 'csp':
+                after_combine.add_module(
+                    'CspLayer',
+                    CSPLayer(
+                        in_channels,
+                        out_channels,
+                        2,
+                        shortcut=True,
+                        depthwise=False,
+                        act='silu'))
+            elif merge_type == 'shuffle':
+                after_combine.add_module(
+                    'shuffleBlock', ShuffleBlock(in_channels, in_channels))
+                after_combine.add_module(
+                    'conv1x1',
+                    create_conv2d(in_channels, out_channels, kernel_size=1))
+            elif merge_type == 'conv':
+                after_combine.add_module(
+                    'conv1x1',
+                    create_conv2d(in_channels, out_channels, kernel_size=1))
+                conv_kwargs = dict(
+                    in_channels=out_channels,
+                    out_channels=out_channels,
+                    kernel_size=3,
+                    padding=pad_type,
+                    bias=False,
+                    norm_layer=norm_layer,
+                    act_layer=act_layer)
+                if not conv_bn_relu_pattern:
+                    conv_kwargs['bias'] = redundant_bias
+                    conv_kwargs['act_layer'] = None
+                    after_combine.add_module('act', act_layer(inplace=True))
+                after_combine.add_module(
+                    'conv',
+                    SeparableConv2d(**conv_kwargs)
+                    if separable_conv else ConvBnAct2d(**conv_kwargs))
+
+            self.fnode.append(
+                GiraffeNode(combine=combine, after_combine=after_combine))
+            self.feature_info[i] = dict(
+                num_chs=fpn_channels[fpn_channels_idx], reduction=reduction)
+
+        self.out_feature_info = []
+        out_node = list(self.feature_info.keys())[-num_levels::]
+        for i in out_node:
+            self.out_feature_info.append(self.feature_info[i])
+
+        self.feature_info = self.out_feature_info
+
+    def forward(self, x: List[torch.Tensor]):
+        for fn in self.fnode:
+            x.append(fn(x))
+        return x[-self.num_levels::]
+
+
+class GiraffeNeck(nn.Module):
+
+    def __init__(self, min_level, max_level, num_levels, norm_layer,
+                 norm_kwargs, act_type, fpn_config, fpn_name, fpn_channels,
+                 out_fpn_channels, weight_method, depth_multiplier,
+                 width_multiplier, with_backslash, with_slash,
+                 with_skip_connect, skip_connect_type, separable_conv,
+                 feature_info, merge_type, pad_type, downsample_type,
+                 upsample_type, apply_resample_bn, conv_after_downsample,
+                 redundant_bias, conv_bn_relu_pattern, alternate_init):
+        super(GiraffeNeck, self).__init__()
+
+        self.num_levels = num_levels
+        self.min_level = min_level
+        self.in_features = [0, 1, 2, 3, 4, 5,
+                            6][self.min_level - 1:self.min_level - 1
+                               + num_levels]
+        self.alternate_init = alternate_init
+        norm_layer = norm_layer or nn.BatchNorm2d
+        if norm_kwargs:
+            norm_layer = partial(norm_layer, **norm_kwargs)
+        act_layer = get_act_layer(act_type) or _ACT_LAYER
+        fpn_config = fpn_config or get_graph_config(
+            fpn_name,
+            min_level=min_level,
+            max_level=max_level,
+            weight_method=weight_method,
+            depth_multiplier=depth_multiplier,
+            with_backslash=with_backslash,
+            with_slash=with_slash,
+            with_skip_connect=with_skip_connect,
+            skip_connect_type=skip_connect_type)
+
+        # width scale
+        for i in range(len(fpn_channels)):
+            fpn_channels[i] = int(fpn_channels[i] * width_multiplier)
+
+        self.resample = nn.ModuleDict()
+        for level in range(num_levels):
+            if level < len(feature_info):
+                in_chs = feature_info[level]['num_chs']
+                reduction = feature_info[level]['reduction']
+            else:
+                # Adds a coarser level by downsampling the last feature map
+                reduction_ratio = 2
+                self.resample[str(level)] = ResampleFeatureMap(
+                    in_channels=in_chs,
+                    out_channels=feature_info[level - 1]['num_chs'],
+                    pad_type=pad_type,
+                    downsample=downsample_type,
+                    upsample=upsample_type,
+                    norm_layer=norm_layer,
+                    reduction_ratio=reduction_ratio,
+                    apply_bn=apply_resample_bn,
+                    conv_after_downsample=conv_after_downsample,
+                    redundant_bias=redundant_bias,
+                )
+                in_chs = feature_info[level - 1]['num_chs']
+                reduction = int(reduction * reduction_ratio)
+                feature_info.append(dict(num_chs=in_chs, reduction=reduction))
+
+        self.cell = SequentialList()
+        logging.debug('building giraffeNeck')
+        giraffe_layer = GiraffeLayer(
+            feature_info=feature_info,
+            fpn_config=fpn_config,
+            inner_fpn_channels=fpn_channels,
+            outer_fpn_channels=out_fpn_channels,
+            num_levels=num_levels,
+            pad_type=pad_type,
+            downsample=downsample_type,
+            upsample=upsample_type,
+            norm_layer=norm_layer,
+            act_layer=act_layer,
+            separable_conv=separable_conv,
+            apply_resample_bn=apply_resample_bn,
+            conv_after_downsample=conv_after_downsample,
+            conv_bn_relu_pattern=conv_bn_relu_pattern,
+            redundant_bias=redundant_bias,
+            merge_type=merge_type)
+        self.cell.add_module('giraffeNeck', giraffe_layer)
+        feature_info = giraffe_layer.feature_info
+
+    def init_weights(self, pretrained=False):
+        for n, m in self.named_modules():
+            if 'backbone' not in n:
+                if self.alternate_init:
+                    _init_weight_alt(m, n)
+                else:
+                    _init_weight(m, n)
+
+    def forward(self, x: List[torch.Tensor]):
+        """Pick the ``in_features`` inputs, extend them, run the cell."""
+        # Indexing accepts a list or a tuple and the selection is a new
+        # list, so no separate tuple-to-list conversion is needed.
+        feats = [x[f] for f in self.in_features]
+        for resample in self.resample.values():
+            feats.append(resample(feats[-1]))
+        return self.cell(feats)
diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
new file mode 100644
index 00000000..b710572f
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
@@ -0,0 +1,203 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import torch
+import torch.nn as nn
+
+from ..core.base_ops import BaseConv, CSPLayer, DWConv
+from ..core.neck_ops import CSPStage
+
+
+class GiraffeNeckV2(nn.Module):
+    """Neck fusing three backbone levels into three outputs (see forward)."""
+    def __init__(
+        self,
+        depth=1.0,
+        width=1.0,
+        in_features=[2, 3, 4],
+        in_channels=[256, 512, 1024],
+        out_channels=[256, 512, 1024],
+        depthwise=False,
+        act='silu',
+        spp=True,
+        reparam_mode=True,
+        block_name='BasicBlock',
+    ):
+        super().__init__()
+        self.in_features = list(in_features)  # copy: defaults are shared
+        self.in_channels = list(in_channels)  # copy: defaults are shared
+        Conv = DWConv if depthwise else BaseConv
+
+        # reparam_mode selects CSPStage (True) or CSPLayer (False) merges
+
+        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+
+        # node x3: input x0, x1
+        self.bu_conv13 = Conv(
+            int(in_channels[1] * width),
+            int(in_channels[1] * width),
+            3,
+            2,
+            act=act)
+        if reparam_mode:
+            self.merge_3 = CSPStage(
+                block_name,
+                int((in_channels[1] + in_channels[2]) * width),
+                int(in_channels[2] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_3 = CSPLayer(
+                int((in_channels[1] + in_channels[2]) * width),
+                int(in_channels[2] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+        # node x4: input x1, x2, x3
+        self.bu_conv24 = Conv(
+            int(in_channels[0] * width),
+            int(in_channels[0] * width),
+            3,
+            2,
+            act=act)
+        if reparam_mode:
+            self.merge_4 = CSPStage(
+                block_name,
+                int((in_channels[0] + in_channels[1] + in_channels[2])
+                    * width),
+                int(in_channels[1] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_4 = CSPLayer(
+                int((in_channels[0] + in_channels[1] + in_channels[2])
+                    * width),
+                int(in_channels[1] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+        # node x5: input x2, x4
+        if reparam_mode:
+            self.merge_5 = CSPStage(
+                block_name,
+                int((in_channels[1] + in_channels[0]) * width),
+                int(out_channels[0] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_5 = CSPLayer(
+                int((in_channels[1] + in_channels[0]) * width),
+                int(out_channels[0] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+        # node x7: input x4, x5
+        self.bu_conv57 = Conv(
+            int(out_channels[0] * width),
+            int(out_channels[0] * width),
+            3,
+            2,
+            act=act)
+        if reparam_mode:
+            self.merge_7 = CSPStage(
+                block_name,
+                int((out_channels[0] + in_channels[1]) * width),
+                int(out_channels[1] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_7 = CSPLayer(
+                int((out_channels[0] + in_channels[1]) * width),
+                int(out_channels[1] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+        # node x6: input x3, x4, x7
+        self.bu_conv46 = Conv(
+            int(in_channels[1] * width),
+            int(in_channels[1] * width),
+            3,
+            2,
+            act=act)
+        self.bu_conv76 = Conv(
+            int(out_channels[1] * width),
+            int(out_channels[1] * width),
+            3,
+            2,
+            act=act)
+        if reparam_mode:
+            self.merge_6 = CSPStage(
+                block_name,
+                int((in_channels[1] + out_channels[1] + in_channels[2])
+                    * width),
+                int(out_channels[2] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_6 = CSPLayer(
+                int((in_channels[1] + out_channels[1] + in_channels[2])
+                    * width),
+                int(out_channels[2] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+    def init_weights(self):
+        pass
+
+    def forward(self, out_features):
+        """
+        Args:
+            out_features: backbone outputs, indexed by ``self.in_features``.
+
+        Returns:
+            Tuple[Tensor]: fused features ``(x5, x7, x6)``.
+        """
+
+        # selected backbone features, unpacked in the order x2, x1, x0
+        features = [out_features[f] for f in self.in_features]
+        [x2, x1, x0] = features
+
+        # node x3
+        x13 = self.bu_conv13(x1)
+        x3 = torch.cat([x0, x13], 1)
+        x3 = self.merge_3(x3)
+
+        # node x4
+        x34 = self.upsample(x3)
+        x24 = self.bu_conv24(x2)
+        x4 = torch.cat([x1, x24, x34], 1)
+        x4 = self.merge_4(x4)
+
+        # node x5
+        x45 = self.upsample(x4)
+        x5 = torch.cat([x2, x45], 1)
+        x5 = self.merge_5(x5)
+
+        # node x7
+        x57 = self.bu_conv57(x5)
+        x7 = torch.cat([x4, x57], 1)
+        x7 = self.merge_7(x7)
+
+        # node x6
+        x46 = self.bu_conv46(x4)
+        x76 = self.bu_conv76(x7)
+        x6 = torch.cat([x3, x46, x76], 1)
+        x6 = self.merge_6(x6)
+
+        outputs = (x5, x7, x6)
+        return outputs
diff --git a/modelscope/models/cv/tinynas_detection/tinynas_detector.py b/modelscope/models/cv/tinynas_detection/tinynas_detector.py
new file mode 100644
index 00000000..e6f144df
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/tinynas_detector.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import Tasks
+from .detector import SingleStageDetector
+
+
+@MODELS.register_module(
+    Tasks.image_object_detection, module_name=Models.tinynas_detection)
+class TinynasDetector(SingleStageDetector):
+
+    def __init__(self, model_dir, *args, **kwargs):
+        """Forward ``model_dir`` and any extra arguments to the base class."""
+        super().__init__(model_dir, *args, **kwargs)
diff --git a/modelscope/models/cv/tinynas_detection/utils.py b/modelscope/models/cv/tinynas_detection/utils.py
new file mode 100644
index 00000000..d67d3a36
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/utils.py
@@ -0,0 +1,30 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import importlib
+import os
+import sys
+from os.path import dirname, join
+
+
+def get_config_by_file(config_file):
+    """Import ``config_file`` as a module and return its ``Config()``."""
+    try:
+        sys.path.append(os.path.dirname(config_file))
+        module_name = os.path.basename(config_file).split('.')[0]
+        return importlib.import_module(module_name).Config()
+    except Exception as e:
+        # chain the original error so the real cause is not hidden
+        msg = "{} doesn't contains class named 'Config'".format(config_file)
+        raise ImportError(msg) from e
+
+
+def parse_config(config_file):
+    """Load the config object described by a config file.
+
+    Args:
+        config_file (str): file path of config; must not be None.
+    """
+    if config_file is None:  # explicit raise: also enforced under `python -O`
+        raise AssertionError('plz provide config file')
+    return get_config_by_file(config_file)
diff --git a/modelscope/pipelines/cv/tinynas_detection_pipeline.py b/modelscope/pipelines/cv/tinynas_detection_pipeline.py
new file mode 100644
index 00000000..b2063629
--- /dev/null
+++ b/modelscope/pipelines/cv/tinynas_detection_pipeline.py
@@ -0,0 +1,61 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_object_detection, module_name=Pipelines.tinynas_detection)
+class TinynasDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, auto_collate=False, **kwargs)
+        if torch.cuda.is_available():
+            self.device = 'cuda'
+        else:
+            self.device = 'cpu'
+        self.model.to(self.device)
+        self.model.eval()
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Load the image as float64 and apply the model's own preprocess."""
+        img = LoadImage.convert_to_ndarray(input)
+        self.img = img
+        img = img.astype(np.float64)  # np.float was removed in NumPy 1.24
+        img = self.model.preprocess(img)
+        result = {'img': img.to(self.device)}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run ``model.inference`` on ``input['img']``."""
+        outputs = self.model.inference(input['img'])
+        result = {'data': outputs}
+        return result
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Return scores/labels/boxes, or None when bboxes is None."""
+        bboxes, scores, labels = self.model.postprocess(inputs['data'])
+        if bboxes is None:
+            return None
+        outputs = {
+            OutputKeys.SCORES: scores,
+            OutputKeys.LABELS: labels,
+            OutputKeys.BOXES: bboxes
+        }
+        return outputs
diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py
new file mode 100644
index 00000000..6b2ecd0b
--- /dev/null
+++ b/tests/pipelines/test_tinynas_detection.py
@@ -0,0 +1,20 @@
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class TinynasObjectDetectionTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        """Build the pipeline from the hub model and print one prediction."""
+        detector = pipeline(
+            Tasks.image_object_detection, model='damo/cv_tinynas_detection')
+        image_path = 'data/test/images/image_detection.jpg'
+        print(detector(image_path))
+
+
+if __name__ == '__main__':
+    unittest.main()

From 1a22fa02228f0884bcb48bdaccc4f90a24c85009 Mon Sep 17 00:00:00 2001
From: "jiangnana.jnn" <jiangnana.jnn@alibaba-inc.com>
Date: Fri, 2 Sep 2022 14:06:08 +0800
Subject: [PATCH 042/175] fix trainer unittest         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9970626

    * fix trainer unittest
---
 tests/trainers/test_trainer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py
index 17fa97f9..86909f74 100644
--- a/tests/trainers/test_trainer.py
+++ b/tests/trainers/test_trainer.py
@@ -17,7 +17,7 @@ from modelscope.metrics.builder import MetricKeys
 from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.trainers.base import DummyTrainer
-from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
+from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks
 from modelscope.utils.test_utils import create_dummy_test_dataset, test_level
 
 
@@ -67,6 +67,7 @@ class TrainerTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_train_0(self):
         json_cfg = {
+            'task': Tasks.image_classification,
             'train': {
                 'work_dir':
                 self.tmp_dir,
@@ -141,6 +142,7 @@ class TrainerTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_train_1(self):
         json_cfg = {
+            'task': Tasks.image_classification,
             'train': {
                 'work_dir':
                 self.tmp_dir,
@@ -201,6 +203,7 @@ class TrainerTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_train_with_default_config(self):
         json_cfg = {
+            'task': Tasks.image_classification,
             'train': {
                 'work_dir': self.tmp_dir,
                 'dataloader': {
@@ -319,6 +322,7 @@ class TrainerTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_train_with_iters_per_epoch(self):
         json_cfg = {
+            'task': Tasks.image_classification,
             'train': {
                 'work_dir': self.tmp_dir,
                 'dataloader': {

From 4d3716cf4ebd0efc814818709234f93eef8e73c5 Mon Sep 17 00:00:00 2001
From: "xingguang.zxg" <xingguang.zxg@alibaba-inc.com>
Date: Fri, 2 Sep 2022 14:14:47 +0800
Subject: [PATCH 043/175] =?UTF-8?q?[to=20#42322933]=E6=96=87=E6=9C=AC?=
 =?UTF-8?q?=E6=8C=87=E5=AF=BC=E7=9A=84=E8=AF=AD=E4=B9=89=E5=88=86=E5=89=B2?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

文本指导的语义分割模型，根据输入的文本信息，将图像中对应文本描述的物体分割出来。
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9942863
---
 data/test/images/text_driven_segmentation.jpg |   3 +
 modelscope/metainfo.py                        |   2 +
 .../cv/text_driven_segmentation/__init__.py   |   1 +
 .../cv/text_driven_segmentation/clip.py       | 170 ++++++
 .../cv/text_driven_segmentation/lseg_base.py  |  28 +
 .../text_driven_segmentation/lseg_blocks.py   | 334 +++++++++++
 .../cv/text_driven_segmentation/lseg_model.py | 107 ++++
 .../cv/text_driven_segmentation/lseg_net.py   | 197 +++++++
 .../cv/text_driven_segmentation/lseg_vit.py   | 543 ++++++++++++++++++
 .../cv/text_driven_segmentation/model.py      | 458 +++++++++++++++
 .../simple_tokenizer.py                       | 156 +++++
 modelscope/outputs.py                         |   7 +
 modelscope/pipelines/builder.py               |   3 +
 modelscope/pipelines/cv/__init__.py           |   3 +
 .../cv/text_driven_segmentation_pipleline.py  |  51 ++
 modelscope/utils/constant.py                  |   1 +
 .../test_text_driven_segmentation.py          |  28 +
 17 files changed, 2092 insertions(+)
 create mode 100644 data/test/images/text_driven_segmentation.jpg
 create mode 100644 modelscope/models/cv/text_driven_segmentation/__init__.py
 create mode 100644 modelscope/models/cv/text_driven_segmentation/clip.py
 create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_base.py
 create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
 create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_model.py
 create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_net.py
 create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_vit.py
 create mode 100644 modelscope/models/cv/text_driven_segmentation/model.py
 create mode 100644 modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
 create mode 100644 modelscope/pipelines/cv/text_driven_segmentation_pipleline.py
 create mode 100644 tests/pipelines/test_text_driven_segmentation.py

diff --git a/data/test/images/text_driven_segmentation.jpg b/data/test/images/text_driven_segmentation.jpg
new file mode 100644
index 00000000..e3320b1f
--- /dev/null
+++ b/data/test/images/text_driven_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c7d2f279e3b317f1d0de18410a0585e122166fa2464c17b88a0c813f6c58bd4
+size 67861
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index fd653bac..3225710a 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -29,6 +29,7 @@ class Models(object):
     video_summarization = 'pgl-video-summarization'
     swinL_semantic_segmentation = 'swinL-semantic-segmentation'
     vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
+    text_driven_segmentation = 'text-driven-segmentation'
     resnet50_bert = 'resnet50-bert'
 
     # EasyCV models
@@ -143,6 +144,7 @@ class Pipelines(object):
     video_summarization = 'googlenet_pgl_video_summarization'
     image_semantic_segmentation = 'image-semantic-segmentation'
     image_reid_person = 'passvitb-image-reid-person'
+    text_driven_segmentation = 'text-driven-segmentation'
     movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
 
     # nlp tasks
diff --git a/modelscope/models/cv/text_driven_segmentation/__init__.py b/modelscope/models/cv/text_driven_segmentation/__init__.py
new file mode 100644
index 00000000..46daad78
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/__init__.py
@@ -0,0 +1 @@
+from .lseg_base import TextDrivenSegmentation
diff --git a/modelscope/models/cv/text_driven_segmentation/clip.py b/modelscope/models/cv/text_driven_segmentation/clip.py
new file mode 100644
index 00000000..440cccea
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/clip.py
@@ -0,0 +1,170 @@
+""" CLIP
+Adapted from https://github.com/openai/CLIP.
+Originally MIT License, Copyright (c) 2021 OpenAI.
+"""
+
+import hashlib
+import os
+import urllib
+import warnings
+from typing import Any, List, Union
+
+import torch
+from PIL import Image
+from pkg_resources import packaging
+from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
+                                    ToTensor)
+from tqdm import tqdm
+
+from .model import build_model
+from .simple_tokenizer import SimpleTokenizer as _Tokenizer
+
+try:
+    from torchvision.transforms import InterpolationMode
+    BICUBIC = InterpolationMode.BICUBIC
+except ImportError:
+    BICUBIC = Image.BICUBIC
+
+if packaging.version.parse(
+        torch.__version__) < packaging.version.parse('1.7.1'):
+    warnings.warn('PyTorch version 1.7.1 or higher is recommended')
+__all__ = ['load', 'tokenize']
+
+
+def _convert_image_to_rgb(image):  # used as one step of _transform below
+    return image.convert('RGB')
+
+
+def _transform(n_px):
+    """Resize/center-crop to n_px, force RGB, then tensorise and normalise."""
+    mean = (0.48145466, 0.4578275, 0.40821073)
+    std = (0.26862954, 0.26130258, 0.27577711)
+    steps = [
+        Resize(n_px, interpolation=BICUBIC), CenterCrop(n_px),
+        _convert_image_to_rgb, ToTensor(), Normalize(mean, std),
+    ]
+    return Compose(steps)
+
+
+def load(name: str,
+         device: Union[str, torch.device] = 'cuda'
+         if torch.cuda.is_available() else 'cpu',
+         jit: bool = False,
+         root: str = None):
+    """Return (model, preprocess transform); `name` and `root` are unused."""
+    if not jit:
+        model = build_model().to(device)
+        if str(device) == 'cpu':
+            model.float()
+        return model, _transform(model.visual.input_resolution)
+    # NOTE(review): jit=True never assigns `model`; model.apply below fails.
+    # patch the device names
+    device_holder = torch.jit.trace(
+        lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
+    device_node = [
+        n for n in device_holder.graph.findAllNodes('prim::Constant')
+        if 'Device' in repr(n)
+    ][-1]
+
+    def patch_device(module):
+        try:
+            graphs = [module.graph] if hasattr(module, 'graph') else []
+        except RuntimeError:
+            graphs = []
+
+        if hasattr(module, 'forward1'):
+            graphs.append(module.forward1.graph)
+
+        for graph in graphs:
+            for node in graph.findAllNodes('prim::Constant'):
+                if 'value' in node.attributeNames() and str(
+                        node['value']).startswith('cuda'):
+                    node.copyAttributes(device_node)
+
+    model.apply(patch_device)
+    patch_device(model.encode_image)
+    patch_device(model.encode_text)
+
+    # patch dtype to float32 on CPU
+    if str(device) == 'cpu':
+        float_holder = torch.jit.trace(
+            lambda: torch.ones([]).float(), example_inputs=[])
+        float_input = list(float_holder.graph.findNode('aten::to').inputs())[1]
+        float_node = float_input.node()
+
+        def patch_float(module):
+            try:
+                graphs = [module.graph] if hasattr(module, 'graph') else []
+            except RuntimeError:
+                graphs = []
+
+            if hasattr(module, 'forward1'):
+                graphs.append(module.forward1.graph)
+
+            for graph in graphs:
+                for node in graph.findAllNodes('aten::to'):
+                    inputs = list(node.inputs())
+                    for i in [
+                            1, 2
+                    ]:  # dtype can be the second or third argument to aten::to()
+                        if inputs[i].node()['value'] == 5:
+                            inputs[i].node().copyAttributes(float_node)
+
+        model.apply(patch_float)
+        patch_float(model.encode_image)
+        patch_float(model.encode_text)
+
+        model.float()
+
+    return model, _transform(model.input_resolution.item())
+
+
+def tokenize(
+        _tokenizer,
+        texts: Union[str, List[str]],
+        context_length: int = 77,
+        truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
+    """
+    Returns the tokenized representation of given input string(s)
+
+    Parameters
+    ----------
+    _tokenizer : object
+        Tokenizer providing an ``encoder`` mapping and an ``encode`` method
+    texts : Union[str, List[str]]
+        An input string or a list of input strings to tokenize
+    context_length : int
+        The context length to use; all CLIP models use 77 as the context length
+    truncate: bool
+        Whether to truncate the text in case its encoding is longer than the context length
+
+    Returns
+    -------
+    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
+    We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
+    """
+    if isinstance(texts, str):
+        texts = [texts]
+
+    vocab = _tokenizer.encoder
+    sot_token, eot_token = vocab['<|startoftext|>'], vocab['<|endoftext|>']
+    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token]
+                  for text in texts]
+    # torch older than 1.8.0 gets long ids, newer versions get int ids
+    is_old_torch = packaging.version.parse(
+        torch.__version__) < packaging.version.parse('1.8.0')
+    dtype = torch.long if is_old_torch else torch.int
+    result = torch.zeros(len(all_tokens), context_length, dtype=dtype)
+
+    for row, tokens in enumerate(all_tokens):
+        if len(tokens) > context_length:
+            if not truncate:
+                raise RuntimeError(
+                    f'Input {texts[row]} is too long for context length {context_length}'
+                )
+            # keep the first context_length ids and force the last to be EOT
+            tokens = tokens[:context_length]
+            tokens[-1] = eot_token
+        result[row, :len(tokens)] = torch.tensor(tokens)
+
+    return result
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_base.py b/modelscope/models/cv/text_driven_segmentation/lseg_base.py
new file mode 100644
index 00000000..20915396
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_base.py
@@ -0,0 +1,28 @@
+"""
+Adapted from https://github.com/isl-org/lang-seg.
+Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+"""
+
+import torch
+import torch.nn as nn
+
+from .lseg_net import LSeg
+
+
+class TextDrivenSegmentation(nn.Module):
+
+    def __init__(self, model_dir):
+        super().__init__()
+        self.model_dir = model_dir
+        self.net = LSeg(model_dir=model_dir)
+
+    def forward(self, img, txt_list):
+        """Score each image against ['others', its text]; concat on dim 0.
+
+        Image ``i`` (kept as a batch of one) is paired with ``txt_list[i]``.
+        """
+        per_image = [
+            self.net(img[i:i + 1], labelset=['others', txt_list[i]])
+            for i in range(img.size()[0])
+        ]
+        return torch.cat(per_image, dim=0)
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
new file mode 100644
index 00000000..cb550ab7
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
@@ -0,0 +1,334 @@
+"""
+Adapted from https://github.com/isl-org/lang-seg.
+Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+"""
+
+import torch
+import torch.nn as nn
+
+from .lseg_vit import _make_pretrained_clip_vitl16_384, forward_vit
+
+
+def _make_encoder(
+    backbone,
+    features,
+    use_pretrained=True,
+    groups=1,
+    expand=False,
+    exportable=True,
+    hooks=None,
+    use_vit_only=False,
+    use_readout='ignore',
+    enable_attention_hooks=False,
+):
+    """Return (clip_pretrained, pretrained, scratch) for ``backbone``."""
+    if backbone != 'clip_vitl16_384':
+        raise NotImplementedError(f"Backbone '{backbone}' not implemented")
+
+    clip_pretrained, pretrained = _make_pretrained_clip_vitl16_384(
+        use_pretrained,
+        hooks=hooks,
+        use_readout=use_readout,
+        enable_attention_hooks=enable_attention_hooks,
+    )
+    scratch = _make_scratch([256, 512, 1024, 1024],
+                            features,
+                            groups=groups,
+                            expand=expand)
+    return clip_pretrained, pretrained, scratch
+
+
+def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+    """Return an nn.Module holding four 3x3 convs, layer1_rn..layer4_rn."""
+    scratch = nn.Module()
+    out_shape1 = out_shape
+    out_shape2 = out_shape
+    out_shape3 = out_shape
+    out_shape4 = out_shape
+    if expand is True:
+        out_shape1 = out_shape
+        out_shape2 = out_shape * 2
+        out_shape3 = out_shape * 4
+        out_shape4 = out_shape * 8
+    # layerN_rn maps in_shape[N-1] channels to out_shapeN; spatial size is kept
+    scratch.layer1_rn = nn.Conv2d(
+        in_shape[0],
+        out_shape1,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+    scratch.layer2_rn = nn.Conv2d(
+        in_shape[1],
+        out_shape2,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+    scratch.layer3_rn = nn.Conv2d(
+        in_shape[2],
+        out_shape3,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+    scratch.layer4_rn = nn.Conv2d(
+        in_shape[3],
+        out_shape4,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+
+    return scratch
+
+
+class Interpolate(nn.Module):
+    """Module wrapper around ``nn.functional.interpolate``."""
+
+    def __init__(self, scale_factor, mode, align_corners=False):
+        """Store the interpolation settings.
+
+        Args:
+            scale_factor (float): scaling
+            mode (str): interpolation mode
+            align_corners (bool): forwarded unchanged to the interpolation
+        """
+        super().__init__()
+        self.scale_factor = scale_factor
+        self.mode = mode
+        self.align_corners = align_corners
+        self.interp = nn.functional.interpolate
+
+    def forward(self, x):
+        """Resize ``x`` with the stored settings.
+
+        Args:
+            x (tensor): input
+
+        Returns:
+            tensor: interpolated data
+        """
+        # settings captured at construction time, passed through unchanged
+        settings = dict(
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners,
+        )
+        resized = self.interp(x, **settings)
+
+        return resized
+
+
+class ResidualConvUnit(nn.Module):
+    """Two 3x3 convolutions with a skip connection."""
+
+    def __init__(self, features):
+        """Create the two convolutions and the shared in-place ReLU.
+
+        Args:
+            features (int): number of features
+        """
+        super().__init__()
+
+        # identical settings for both convs: the channel count is preserved
+        conv_kwargs = dict(kernel_size=3, stride=1, padding=1, bias=True)
+
+        self.conv1 = nn.Conv2d(features, features, **conv_kwargs)
+        self.conv2 = nn.Conv2d(features, features, **conv_kwargs)
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """Apply relu-conv twice and add the skip connection.
+
+        Args:
+            x (tensor): input
+
+        Returns:
+            tensor: output
+        """
+        # self.relu is in-place: `x` itself is rectified here, so the skip
+        # connection below adds relu(x) rather than the untouched input
+        branch = self.conv1(self.relu(x))
+        branch = self.conv2(self.relu(branch))
+
+        return branch + x
+
+
+class FeatureFusionBlock(nn.Module):
+    """Feature fusion block."""
+
+    def __init__(self, features):
+        """Init.
+
+        Args:
+            features (int): number of features
+        """
+        super(FeatureFusionBlock, self).__init__()
+
+        self.resConfUnit1 = ResidualConvUnit(features)
+        self.resConfUnit2 = ResidualConvUnit(features)
+
+    def forward(self, *xs):
+        """Fuse one or two feature maps and upsample the result by 2.
+
+        Returns:
+            tensor: output
+        """
+        output = xs[0]
+        # out-of-place add: `+=` would write the sum into the caller's xs[0]
+        if len(xs) == 2:
+            output = output + self.resConfUnit1(xs[1])
+
+        output = self.resConfUnit2(output)
+
+        output = nn.functional.interpolate(
+            output, scale_factor=2, mode='bilinear', align_corners=True)
+
+        return output
+
+
+class ResidualConvUnit_custom(nn.Module):
+    """Residual convolution module."""
+
+    def __init__(self, features, activation, bn):
+        """Init.
+
+        Args:
+            features (int): number of features
+            activation: callable applied before each convolution
+            bn: `bn is True` adds BatchNorm after each conv; bias is `not bn`
+        """
+        super().__init__()
+        self.bn = bn
+        self.groups = 1
+
+        self.conv1 = nn.Conv2d(
+            features,
+            features,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=not self.bn,
+            groups=self.groups,
+        )
+
+        self.conv2 = nn.Conv2d(
+            features,
+            features,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=not self.bn,
+            groups=self.groups,
+        )
+
+        if self.bn is True:
+            self.bn1 = nn.BatchNorm2d(features)
+            self.bn2 = nn.BatchNorm2d(features)
+
+        self.activation = activation
+
+        self.skip_add = nn.quantized.FloatFunctional()
+
+    def forward(self, x):
+        """Forward pass.
+
+        Args:
+            x (tensor): input
+
+        Returns:
+            tensor: output
+        """
+
+        out = self.activation(x)
+        out = self.conv1(out)
+        if self.bn is True:
+            out = self.bn1(out)
+
+        out = self.activation(out)
+        out = self.conv2(out)
+        if self.bn is True:
+            out = self.bn2(out)
+        # NOTE(review): conv_merge is not defined here; groups is fixed to 1
+        if self.groups > 1:
+            out = self.conv_merge(out)
+
+        return self.skip_add.add(out, x)
+
+
+class FeatureFusionBlock_custom(nn.Module):
+    """Feature fusion block."""
+
+    def __init__(
+        self,
+        features,
+        activation,
+        deconv=False,
+        bn=False,
+        expand=False,
+        align_corners=True,
+    ):
+        """Init.
+
+        Args:
+            features (int): number of features
+            activation, bn: forwarded to both ResidualConvUnit_custom units
+            deconv, expand: stored; expand=True halves out_conv's out channels
+            align_corners (bool): used by the bilinear upsampling in forward
+        """
+        super(FeatureFusionBlock_custom, self).__init__()
+        self.deconv = deconv
+        self.align_corners = align_corners
+        self.groups = 1
+        self.expand = expand
+        out_features = features
+        if self.expand is True:
+            out_features = features // 2
+
+        self.out_conv = nn.Conv2d(
+            features,
+            out_features,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+            groups=1,
+        )
+
+        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
+        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
+
+        self.skip_add = nn.quantized.FloatFunctional()
+
+    def forward(self, *xs):
+        """Fuse one or two feature maps, upsample by 2, then apply out_conv.
+
+        Returns:
+            tensor: output
+        """
+        output = xs[0]
+
+        if len(xs) == 2:
+            res = self.resConfUnit1(xs[1])
+            output = self.skip_add.add(output, res)
+
+        output = self.resConfUnit2(output)
+
+        output = nn.functional.interpolate(
+            output,
+            scale_factor=2,
+            mode='bilinear',
+            align_corners=self.align_corners)
+
+        output = self.out_conv(output)
+        return output
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_model.py b/modelscope/models/cv/text_driven_segmentation/lseg_model.py
new file mode 100644
index 00000000..1d7ebdd1
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_model.py
@@ -0,0 +1,107 @@
+import os.path as osp
+from typing import Any, Dict
+
+import json
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.text_driven_segmentation import \
+    TextDrivenSegmentation
+from modelscope.outputs import OutputKeys
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+__all__ = ['TextDrivenSeg']
+
+
+@MODELS.register_module(
+    Tasks.text_driven_segmentation,
+    module_name=Models.text_driven_segmentation)
+class TextDrivenSeg(TorchModel):
+    """Text driven segmentation: RGB image + text prompt -> 0/255 mask.
+    Runs on cuda:<device_id> when CUDA is available, otherwise on CPU."""
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+        self.model = TextDrivenSegmentation(model_dir=model_dir)
+        ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
+        # map to CPU first so a GPU-saved checkpoint loads on CPU-only hosts
+        self.model.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
+        self.model.eval()
+        if device_id >= 0 and torch.cuda.is_available():
+            self.model.to('cuda:{}'.format(device_id))
+            logger.info('Use GPU: {}'.format(device_id))
+        else:
+            device_id = -1
+            logger.info('Use CPU for inference')
+        self.device_id = device_id
+
+    def preprocess(self, img, size=640):
+        """Resize, normalise and zero-pad `img` to shape (1, 3, size, size).
+
+        Returns (tensor, ori_h, ori_w, crop_h, crop_w); the crop_* values give
+        the extent of the resized image inside the zero-padded square.
+        """
+        mean = np.array([0.48145466, 0.4578275, 0.40821073], np.float32)
+        std = np.array([0.26862954, 0.26130258, 0.27577711], np.float32)
+        h, w, _ = img.shape
+        ratio = 1.0 * size / max(h, w)  # the longer side becomes `size`
+        crop_h, crop_w = int(ratio * h), int(ratio * w)
+        pil_img = Image.fromarray(img).resize((crop_w, crop_h), Image.BILINEAR)
+        np_img = np.array(pil_img, dtype=np.float32) / 255.
+        img_pad = np.zeros((size, size, 3), dtype=np.float32)
+        img_pad[:crop_h, :crop_w] = (np_img - mean) / std  # per channel
+        img_pad = torch.from_numpy(img_pad).permute(2, 0, 1).unsqueeze(0)
+        return img_pad.float(), h, w, crop_h, crop_w
+
+    def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
+        """Map the padded label map to a 0/255 uint8 mask (ori_h x ori_w)."""
+        # scale labels to 0..255 and drop the zero-padded border
+        output = np.clip(tensors * 255., a_min=0, a_max=255.)
+        crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)
+        pil_output = Image.fromarray(crop_output)
+        pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
+        np_output = np.array(pil_output, dtype=np.uint8)
+        # re-binarise the values blurred by the bilinear resize
+        return np.where(np_output < 128, 0, 255).astype(np.uint8)
+
+    def forward(self, image, text):
+        """Segment the region of `image` described by `text`.
+        image should be numpy array, dtype=np.uint8, shape: height*width*3
+        """
+        image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
+            image, size=640)
+        pred = self.inference(image_tensor, text)
+        # postprocess() takes no `size` argument; passing one raised TypeError
+        msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)
+        return {OutputKeys.MASKS: msk}
+
+    def inference(self, image, text):
+        """Run the network on a preprocessed batch and return the label map.
+
+        image should be tensor, 1 * 3 * 640 * 640; returns a 2-D numpy array
+        """
+        with torch.no_grad():
+            if self.device_id == -1:
+                # the CPU path must pass the prompt too (it used to drop it)
+                output = self.model(image, [text])
+            else:
+                device = torch.device('cuda', self.device_id)
+                output = self.model(image.to(device), [text])
+            output = F.interpolate(output, size=(640, 640), mode='bilinear')
+            output = F.softmax(output, dim=1)
+            # per-pixel index of the best class, first batch item only
+            output = torch.argmax(output, dim=1)[0]
+            # .cpu() is a no-op on CPU tensors, so one path serves both devices
+            pred = output.cpu().numpy()
+            del output
+        return pred
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_net.py b/modelscope/models/cv/text_driven_segmentation/lseg_net.py
new file mode 100644
index 00000000..1a558c5c
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_net.py
@@ -0,0 +1,197 @@
+"""
+Adapted from https://github.com/isl-org/lang-seg.
+Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from . import clip
+from .lseg_blocks import (FeatureFusionBlock, FeatureFusionBlock_custom,
+                          Interpolate, _make_encoder, forward_vit)
+from .simple_tokenizer import SimpleTokenizer
+
+
+class depthwise_clipseg_conv(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.depthwise = nn.Conv2d(1, 1, kernel_size=3, padding=1)
+
+    def depthwise_clipseg(self, x, channels):
+        """Run the shared 1->1 conv on each of the first `channels` channels."""
+        planes = (x[:, idx].unsqueeze(1) for idx in range(channels))
+        filtered = [self.depthwise(plane) for plane in planes]
+        return torch.cat(filtered, dim=1)
+
+    def forward(self, x):
+        """Filter every channel of `x` independently with the shared conv."""
+        n_channels = x.shape[1]
+        return self.depthwise_clipseg(x, n_channels)
+
+
+class depthwise_conv(nn.Module):
+
+    def __init__(self, kernel_size=3, stride=1, padding=1):
+        super(depthwise_conv, self).__init__()
+        self.depthwise = nn.Conv2d(
+            1, 1, kernel_size=kernel_size, stride=stride, padding=padding)
+
+    def forward(self, x):
+        """Apply the shared 1->1 conv to every channel of an NCHW tensor."""
+        C, H, W = x.shape[1:]
+        x = self.depthwise(x.reshape(-1, 1, H, W))  # channels folded into batch
+        # restore channels with the conv's own output size: H and W change
+        # whenever stride/padding do not preserve the spatial shape
+        return x.view(-1, C, x.shape[-2], x.shape[-1])
+
+
+class depthwise_block(nn.Module):
+    """Depthwise conv + activation. NOTE: kernel_size/stride/padding unused."""
+    def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
+        super(depthwise_block, self).__init__()
+        self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
+        if activation == 'relu':
+            self.activation = nn.ReLU()
+        elif activation == 'lrelu':
+            self.activation = nn.LeakyReLU()
+        elif activation == 'tanh':
+            self.activation = nn.Tanh()  # other names: attribute stays unset
+
+    def forward(self, x, act=True):
+        x = self.depthwise(x)
+        if act:  # a falsy act returns the raw conv output
+            x = self.activation(x)
+        return x
+
+
+class bottleneck_block(nn.Module):
+    """Depthwise conv plus channel-max skip. NOTE: conv ctor args are unused."""
+    def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
+        super(bottleneck_block, self).__init__()
+        self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
+        if activation == 'relu':
+            self.activation = nn.ReLU()
+        elif activation == 'lrelu':
+            self.activation = nn.LeakyReLU()
+        elif activation == 'tanh':
+            self.activation = nn.Tanh()  # other names: attribute stays unset
+
+    def forward(self, x, act=True):
+        sum_layer = x.max(dim=1, keepdim=True)[0]  # per-pixel max over channels
+        x = self.depthwise(x)
+        x = x + sum_layer  # the single-channel max broadcasts over channels
+        if act:
+            x = self.activation(x)
+        return x
+
+
+class BaseModel(torch.nn.Module):
+
+    def load(self, path):
+        """Restore this module's weights from a checkpoint file.
+        Args:
+            path (str): checkpoint location, deserialized onto the CPU
+        """
+        cpu = torch.device('cpu')
+        state = torch.load(path, map_location=cpu)
+        # a checkpoint with an 'optimizer' entry keeps its weights under 'model'
+        weights = state['model'] if 'optimizer' in state else state
+
+        self.load_state_dict(weights)
+
+
+def _make_fusion_block(features, use_bn):
+    """Build a non-expanding FeatureFusionBlock_custom with a ReLU activation."""
+    block_options = dict(deconv=False, bn=use_bn, expand=False)
+    return FeatureFusionBlock_custom(
+        features,
+        activation=nn.ReLU(False),
+        align_corners=True,
+        **block_options,
+    )
+
+
+class LSeg(BaseModel):
+    """Scores per-pixel ViT decoder features against encoded text features."""
+    def __init__(
+        self,
+        features=256,
+        backbone='clip_vitl16_384',
+        readout='project',
+        use_bn=True,
+        model_dir=None,
+    ):
+        super(LSeg, self).__init__()
+        hooks = {
+            'clip_vitl16_384': [5, 11, 17, 23],
+        }
+        # only backbones listed in `hooks` are accepted (KeyError otherwise)
+        # Instantiate backbone and reassemble blocks
+        self.clip_pretrained, self.pretrained, self.scratch = _make_encoder(
+            backbone,
+            features,
+            groups=1,
+            expand=False,
+            exportable=False,
+            hooks=hooks[backbone],
+            use_readout=readout,
+        )
+
+        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
+        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
+        self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
+        self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
+        # NOTE(review): .exp() yields a plain tensor, not a registered Parameter
+        self.logit_scale = nn.Parameter(torch.ones([])
+                                        * np.log(1 / 0.07)).exp()
+        self.out_c = 512
+        self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1)
+
+        self.scratch.output_conv = nn.Sequential(
+            Interpolate(scale_factor=2, mode='bilinear', align_corners=True), )
+        # tau divides the similarity logits in forward()
+        self.tau = 0.07
+        self.model_dir = model_dir  # must be a str: concatenated just below
+        self.tokenizer = SimpleTokenizer(model_dir
+                                         + '/bpe_simple_vocab_16e6.txt.gz')
+
+    def forward(self, x, labelset=''):
+        text = clip.tokenize(self.tokenizer, labelset)
+        # four intermediate feature maps returned by forward_vit
+        layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
+
+        layer_1_rn = self.scratch.layer1_rn(layer_1)
+        layer_2_rn = self.scratch.layer2_rn(layer_2)
+        layer_3_rn = self.scratch.layer3_rn(layer_3)
+        layer_4_rn = self.scratch.layer4_rn(layer_4)
+        # decoder: start from layer 4 and merge layers 3, 2 and 1 in turn
+        path_4 = self.scratch.refinenet4(layer_4_rn)
+        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
+        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
+        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
+
+        text = text.to(x.device)
+        text_features = self.clip_pretrained.encode_text(text)
+
+        image_features = self.scratch.head1(path_1)
+        # flatten the head1 output to one out_c-sized row per pixel
+        imshape = image_features.shape
+        image_features = image_features.permute(0, 2, 3,
+                                                1).reshape(-1, self.out_c)
+
+        # normalized features
+        image_features = image_features / image_features.norm(
+            dim=-1, keepdim=True)
+        text_features = text_features / text_features.norm(
+            dim=-1, keepdim=True)
+        # similarity of every pixel row to every text feature, divided by tau
+        logits_per_image = image_features @ text_features.t() / self.tau
+        # reshape the flat scores back to batch x texts x height x width
+        out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3],
+                                            -1).permute(0, 3, 1, 2)
+
+        out = self.scratch.output_conv(out)
+
+        return out
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py
new file mode 100644
index 00000000..be2813c2
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py
@@ -0,0 +1,543 @@
+"""
+Adapted from https://github.com/isl-org/lang-seg.
+Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+"""
+
+import math
+import types
+
+import timm
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+
+from . import clip
+
+activations = {}
+
+
+def get_activation(name):
+    """Return a forward hook that stores the output in activations[name]."""
+    def hook(model, input, output):
+        activations[name] = output  # module-level dict, overwritten per call
+
+    return hook
+
+
+attention = {}
+
+
+def get_attention(name):
+    """Return a forward hook that stores softmax attention in attention[name]."""
+    def hook(module, input, output):
+        x = input[0]
+        B, N, C = x.shape
+        qkv = (
+            module.qkv(x).reshape(B, N, 3, module.num_heads,
+                                  C // module.num_heads).permute(
+                                      2, 0, 3, 1, 4))
+        q, k, _ = (
+            qkv[0],
+            qkv[1],
+            qkv[2],
+        )  # make torchscript happy (cannot use tensor as tuple)
+        # weights are recomputed from the module input; `output` is not used
+        attn = (q @ k.transpose(-2, -1)) * module.scale
+
+        attn = attn.softmax(dim=-1)  # [:,:,1,1:]
+        attention[name] = attn
+
+    return hook
+
+
+def get_mean_attention_map(attn, token, shape):
+    """Upsample the attention row of `token` to shape[2:], then average dim 0.
+    Key index 0 is dropped; the rest is laid out on a grid of
+    shape[2] // 16 by shape[3] // 16 cells before bicubic interpolation.
+    """
+    grid = torch.Size([shape[2] // 16, shape[3] // 16])
+    token_attn = attn[:, :, token, 1:].unflatten(2, grid).float()
+    upsampled = torch.nn.functional.interpolate(
+        token_attn, size=shape[2:], mode='bicubic', align_corners=False)
+    return torch.mean(upsampled.squeeze(0), 0)
+
+
+class Slice(nn.Module):
+    """Drop the first `start_index` entries along dim 1 (readout 'ignore')."""
+    def __init__(self, start_index=1):
+        super(Slice, self).__init__()
+        self.start_index = start_index
+
+    def forward(self, x):
+        return x[:, self.start_index:]
+
+
+class AddReadout(nn.Module):
+    """Add the readout token(s) to every remaining token (readout 'add')."""
+    def __init__(self, start_index=1):
+        super().__init__()
+        self.start_index = start_index
+
+    def forward(self, x):
+        # start_index == 2: readout is the mean of tokens 0 and 1, else token 0
+        two_prefix_tokens = self.start_index == 2
+        readout = (x[:, 0] + x[:, 1]) / 2 if two_prefix_tokens else x[:, 0]
+        tokens = x[:, self.start_index:]
+        return tokens + readout.unsqueeze(1)
+
+
+class ProjectReadout(nn.Module):
+    """Concatenate token 0 onto each remaining token, then Linear + GELU."""
+    def __init__(self, in_features, start_index=1):
+        super(ProjectReadout, self).__init__()
+        self.start_index = start_index
+        # maps the concatenated 2 * in_features vector back to in_features
+        self.project = nn.Sequential(
+            nn.Linear(2 * in_features, in_features), nn.GELU())
+
+    def forward(self, x):
+        readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:])
+        features = torch.cat((x[:, self.start_index:], readout), -1)
+        # the result keeps the token count of x[:, start_index:]
+        return self.project(features)
+
+
+class Transpose(nn.Module):
+    """Module wrapper around Tensor.transpose(dim0, dim1)."""
+    def __init__(self, dim0, dim1):
+        super().__init__()
+        self.dim0, self.dim1 = dim0, dim1
+        # no parameters: the two axes are the only state
+
+    def forward(self, x):
+        swapped = x.transpose(self.dim0, self.dim1)
+        return swapped
+
+
+def forward_vit(pretrained, x):
+    b, c, h, w = x.shape
+
+    # encoder: run for its forward hooks; the returned tokens are discarded
+    _ = pretrained.model.forward_flex(x)
+
+    layer_1 = pretrained.activations['1']
+    layer_2 = pretrained.activations['2']
+    layer_3 = pretrained.activations['3']
+    layer_4 = pretrained.activations['4']
+    # stages 0-1 of each post-processing chain
+    layer_1 = pretrained.act_postprocess1[0:2](layer_1)
+    layer_2 = pretrained.act_postprocess2[0:2](layer_2)
+    layer_3 = pretrained.act_postprocess3[0:2](layer_3)
+    layer_4 = pretrained.act_postprocess4[0:2](layer_4)
+    # stage 2 of the chains is skipped; an Unflatten sized for this input is used
+    unflatten = nn.Sequential(
+        nn.Unflatten(
+            2,
+            torch.Size([
+                h // pretrained.model.patch_size[1],
+                w // pretrained.model.patch_size[0],
+            ]),
+        ))
+
+    if layer_1.ndim == 3:
+        layer_1 = unflatten(layer_1)
+    if layer_2.ndim == 3:
+        layer_2 = unflatten(layer_2)
+    if layer_3.ndim == 3:
+        layer_3 = unflatten(layer_3)
+    if layer_4.ndim == 3:
+        layer_4 = unflatten(layer_4)
+    # remaining stages: index 3 onwards of each chain
+    layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)](
+        layer_1)
+    layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)](
+        layer_2)
+    layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)](
+        layer_3)
+    layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)](
+        layer_4)
+
+    return layer_1, layer_2, layer_3, layer_4
+
+
+def _resize_pos_embed(self, posemb, gs_h, gs_w):
+    posemb_tok, posemb_grid = (
+        posemb[:, :self.start_index],  # first start_index entries, kept as-is
+        posemb[0, self.start_index:],  # grid entries, resized below
+    )
+    # the stored grid is treated as square: side = sqrt(number of grid entries)
+    gs_old = int(math.sqrt(len(posemb_grid)))
+    # interpolate as an image: (1, dim, gs_old, gs_old) -> (1, dim, gs_h, gs_w)
+    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
+                                      -1).permute(0, 3, 1, 2)
+    posemb_grid = F.interpolate(
+        posemb_grid, size=(gs_h, gs_w), mode='bilinear')
+    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
+
+    posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
+
+    return posemb
+
+
+def forward_flex(self, x):
+    b, c, h, w = x.shape
+    # resize the position embedding to this input's patch grid
+    pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1],
+                                       w // self.patch_size[0])
+
+    B = x.shape[0]
+
+    if hasattr(self.patch_embed, 'backbone'):
+        x = self.patch_embed.backbone(x)
+        if isinstance(x, (list, tuple)):
+            x = x[
+                -1]  # last feature if backbone outputs list/tuple of features
+    x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
+    # prepend the class token (and the dist token when the model has one)
+    if getattr(self, 'dist_token', None) is not None:
+        cls_tokens = self.cls_token.expand(
+            B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        dist_token = self.dist_token.expand(B, -1, -1)
+        x = torch.cat((cls_tokens, dist_token, x), dim=1)
+    else:
+        cls_tokens = self.cls_token.expand(
+            B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+
+    x = x + pos_embed
+    x = self.pos_drop(x)
+    # hard-coded off, so the checkpoint branch below is never taken
+    gradient_checkpoint = False
+    for blk in self.blocks:
+        if gradient_checkpoint:
+            x = checkpoint.checkpoint(blk, x)
+        else:
+            x = blk(x)
+
+    x = self.norm(x)
+
+    return x
+
+
+def get_readout_oper(vit_features, features, use_readout, start_index=1):
+    """Return one readout module per entry of `features` for the given mode."""
+    if use_readout == 'ignore':
+        return [Slice(start_index)] * len(features)
+    if use_readout == 'add':
+        return [AddReadout(start_index)] * len(features)
+    if use_readout == 'project':
+        # ProjectReadout holds weights, so each entry gets its own instance
+        return [
+            ProjectReadout(vit_features, start_index) for _ in features
+        ]
+    # raise instead of `assert False`: asserts are stripped under `python -O`
+    raise ValueError(
+        "wrong operation for readout token, use_readout can be 'ignore', "
+        "'add', or 'project'")
+
+
+def adapt_input_conv(in_chans, conv_weight):
+    """Adapt a conv weight of shape (O, I, J, K) to `in_chans` input channels.
+
+    in_chans 1: input channels summed; 3: unchanged; other: RGB tiled, rescaled.
+    """
+    orig_dtype = conv_weight.dtype
+    # Some weights are in torch.half, ensure it's float for sum on CPU
+    weight = conv_weight.float()
+    O, II, J, K = weight.shape
+    if in_chans == 1:
+        if II > 3:
+            assert weight.shape[1] % 3 == 0
+            # For models with space2depth stems
+            weight = weight.reshape(O, II // 3, 3, J, K).sum(dim=2)
+        else:
+            weight = weight.sum(dim=1, keepdim=True)
+    elif in_chans != 3:
+        if II != 3:
+            raise NotImplementedError(
+                'Weight format not supported by conversion.')
+        # NOTE this strategy should be better than random init, but there could be other combinations of
+        # the original RGB input layer weights that'd work better for specific cases.
+        n_repeat = int(math.ceil(in_chans / 3))
+        weight = weight.repeat(1, n_repeat, 1, 1)[:, :in_chans, :, :]
+        weight *= (3 / float(in_chans))
+    return weight.to(orig_dtype)
+
+
+@torch.no_grad()
+def _load_weights(model, checkpoint_path, prefix=''):
+    """ Load weights from .npz checkpoints for official Google Brain Flax implementation
+    Copies into `model` in place; `prefix` is prepended to every array key."""
+    import numpy as np
+    # numpy -> torch; with t=True, 4/3/2-D arrays get their axes permuted first
+    def _n2p(w, t=True):
+        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
+            w = w.flatten()
+        if t:
+            if w.ndim == 4:
+                w = w.transpose([3, 2, 0, 1])
+            elif w.ndim == 3:
+                w = w.transpose([2, 0, 1])
+            elif w.ndim == 2:
+                w = w.transpose([1, 0])
+        return torch.from_numpy(w)
+
+    w = np.load(checkpoint_path)
+    if not prefix and 'opt/target/embedding/kernel' in w:
+        prefix = 'opt/target/'  # auto-detected when no prefix was given
+
+    if hasattr(model.patch_embed, 'backbone'):
+        # hybrid
+        backbone = model.patch_embed.backbone
+        stem_only = not hasattr(backbone, 'stem')
+        stem = backbone if stem_only else backbone.stem
+        stem.conv.weight.copy_(
+            adapt_input_conv(stem.conv.weight.shape[1],
+                             _n2p(w[f'{prefix}conv_root/kernel'])))
+        stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
+        stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
+        if not stem_only:
+            for i, stage in enumerate(backbone.stages):
+                for j, block in enumerate(stage.blocks):
+                    bp = f'{prefix}block{i + 1}/unit{j + 1}/'
+                    for r in range(3):
+                        getattr(block, f'conv{r + 1}').weight.copy_(
+                            _n2p(w[f'{bp}conv{r + 1}/kernel']))
+                        getattr(block, f'norm{r + 1}').weight.copy_(
+                            _n2p(w[f'{bp}gn{r + 1}/scale']))
+                        getattr(block, f'norm{r + 1}').bias.copy_(
+                            _n2p(w[f'{bp}gn{r + 1}/bias']))
+                    if block.downsample is not None:
+                        block.downsample.conv.weight.copy_(
+                            _n2p(w[f'{bp}conv_proj/kernel']))
+                        block.downsample.norm.weight.copy_(
+                            _n2p(w[f'{bp}gn_proj/scale']))
+                        block.downsample.norm.bias.copy_(
+                            _n2p(w[f'{bp}gn_proj/bias']))
+        embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
+    else:
+        embed_conv_w = adapt_input_conv(model.patch_embed.proj.weight.shape[1],
+                                        _n2p(w[f'{prefix}embedding/kernel']))
+    model.patch_embed.proj.weight.copy_(embed_conv_w)
+    model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
+    model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
+    pos_embed_w = _n2p(
+        w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
+    if pos_embed_w.shape != model.pos_embed.shape:
+        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
+            pos_embed_w, model.pos_embed, getattr(model, 'num_prefix_tokens',
+                                                  1),
+            model.patch_embed.grid_size)
+    model.pos_embed.copy_(pos_embed_w)
+    model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
+    model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
+    if isinstance(
+            model.head, nn.Linear
+    ) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
+        model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
+        model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
+    # NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights
+    # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
+    #     model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
+    #     model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
+    for i, block in enumerate(model.blocks.children()):
+        block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
+        mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
+        block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
+        block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
+        block.attn.qkv.weight.copy_(  # query, key, value stacked in that order
+            torch.cat([
+                _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T
+                for n in ('query', 'key', 'value')
+            ]))
+        block.attn.qkv.bias.copy_(
+            torch.cat([
+                _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1)
+                for n in ('query', 'key', 'value')
+            ]))
+        block.attn.proj.weight.copy_(
+            _n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
+        block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
+        for r in range(2):
+            getattr(block.mlp, f'fc{r + 1}').weight.copy_(
+                _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
+            getattr(block.mlp, f'fc{r + 1}').bias.copy_(
+                _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
+        block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
+        block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
+
+
+def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()):
+    # Rescale the grid of position embeddings when loading from state_dict. Adapted from
+    # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
+    ntok_new = posemb_new.shape[1]  # entry count of the target embedding
+    if num_prefix_tokens:
+        posemb_prefix, posemb_grid = posemb[:, :num_prefix_tokens], posemb[
+            0, num_prefix_tokens:]
+        ntok_new -= num_prefix_tokens
+    else:
+        posemb_prefix, posemb_grid = posemb[:, :0], posemb[0]
+    gs_old = int(math.sqrt(len(posemb_grid)))  # old grid is treated as square
+    if not len(gs_new):  # backwards compatibility
+        gs_new = [int(math.sqrt(ntok_new))] * 2  # square target grid by default
+    assert len(gs_new) >= 2
+    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
+                                      -1).permute(0, 3, 1, 2)
+    posemb_grid = F.interpolate(
+        posemb_grid, size=gs_new, mode='bicubic', align_corners=False)
+    posemb_grid = posemb_grid.permute(0, 2, 3,
+                                      1).reshape(1, gs_new[0] * gs_new[1], -1)
+    posemb = torch.cat([posemb_prefix, posemb_grid], dim=1)  # prefix unchanged
+    return posemb
+
+
+def _make_pretrained_clip_vitl16_384(pretrained,
+                                     use_readout='ignore',
+                                     hooks=None,
+                                     enable_attention_hooks=False):
+    clip_pretrained, _ = clip.load('ViT-B/32', device='cpu', jit=False)
+    # NOTE(review): the `pretrained` argument is unused; the name is rebound below
+    # model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
+    model = timm.create_model('vit_large_patch16_384', pretrained=False)
+    hooks = [5, 11, 17, 23] if hooks is None else hooks
+    pretrained = _make_vit_b16_backbone(
+        model,
+        features=[256, 512, 1024, 1024],
+        hooks=hooks,
+        vit_features=1024,
+        use_readout=use_readout,
+        enable_attention_hooks=enable_attention_hooks,
+    )
+    return clip_pretrained, pretrained
+
+
+def _make_vit_b16_backbone(
+    model,
+    features=[96, 192, 384, 768],  # list defaults are only read in this body
+    size=[384, 384],
+    hooks=[2, 5, 8, 11],
+    vit_features=768,
+    use_readout='ignore',
+    start_index=1,
+    enable_attention_hooks=False,
+):
+    pretrained = nn.Module()  # plain container for the model and its chains
+    # capture the outputs of the four blocks selected by `hooks`
+    pretrained.model = model
+    pretrained.model.blocks[hooks[0]].register_forward_hook(
+        get_activation('1'))
+    pretrained.model.blocks[hooks[1]].register_forward_hook(
+        get_activation('2'))
+    pretrained.model.blocks[hooks[2]].register_forward_hook(
+        get_activation('3'))
+    pretrained.model.blocks[hooks[3]].register_forward_hook(
+        get_activation('4'))
+
+    pretrained.activations = activations  # the module-level dict, not a copy
+
+    if enable_attention_hooks:
+        pretrained.model.blocks[hooks[0]].attn.register_forward_hook(
+            get_attention('attn_1'))
+        pretrained.model.blocks[hooks[1]].attn.register_forward_hook(
+            get_attention('attn_2'))
+        pretrained.model.blocks[hooks[2]].attn.register_forward_hook(
+            get_attention('attn_3'))
+        pretrained.model.blocks[hooks[3]].attn.register_forward_hook(
+            get_attention('attn_4'))
+        pretrained.attention = attention
+
+    readout_oper = get_readout_oper(vit_features, features, use_readout,
+                                    start_index)
+    # chain 1: 1x1 conv, then a stride-4 transposed conv
+    # 32, 48, 136, 384
+    pretrained.act_postprocess1 = nn.Sequential(
+        readout_oper[0],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[0],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.ConvTranspose2d(
+            in_channels=features[0],
+            out_channels=features[0],
+            kernel_size=4,
+            stride=4,
+            padding=0,
+            bias=True,
+            dilation=1,
+            groups=1,
+        ),
+    )
+    # chain 2: 1x1 conv, then a stride-2 transposed conv
+    pretrained.act_postprocess2 = nn.Sequential(
+        readout_oper[1],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[1],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.ConvTranspose2d(
+            in_channels=features[1],
+            out_channels=features[1],
+            kernel_size=2,
+            stride=2,
+            padding=0,
+            bias=True,
+            dilation=1,
+            groups=1,
+        ),
+    )
+    # chain 3: 1x1 conv only
+    pretrained.act_postprocess3 = nn.Sequential(
+        readout_oper[2],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[2],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+    )
+    # chain 4: 1x1 conv, then a stride-2 3x3 conv
+    pretrained.act_postprocess4 = nn.Sequential(
+        readout_oper[3],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[3],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.Conv2d(
+            in_channels=features[3],
+            out_channels=features[3],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        ),
+    )
+    # attributes read by forward_flex / _resize_pos_embed / forward_vit
+    pretrained.model.start_index = start_index
+    pretrained.model.patch_size = [16, 16]
+
+    # We inject this function into the VisionTransformer instances so that
+    # we can use it with interpolated position embeddings without modifying the library source.
+    pretrained.model.forward_flex = types.MethodType(forward_flex,
+                                                     pretrained.model)
+    pretrained.model._resize_pos_embed = types.MethodType(
+        _resize_pos_embed, pretrained.model)
+
+    return pretrained
diff --git a/modelscope/models/cv/text_driven_segmentation/model.py b/modelscope/models/cv/text_driven_segmentation/model.py
new file mode 100644
index 00000000..ece10bab
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/model.py
@@ -0,0 +1,458 @@
+"""
+Adapted from https://github.com/isl-org/lang-seg.
+Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+"""
+
+from collections import OrderedDict
+from typing import Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+
+        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu1 = nn.ReLU(inplace=True)
+
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.relu2 = nn.ReLU(inplace=True)
+
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+
+        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+        self.relu3 = nn.ReLU(inplace=True)
+
+        self.downsample = None
+        self.stride = stride
+
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
+            self.downsample = nn.Sequential(
+                OrderedDict([('-1', nn.AvgPool2d(stride)),
+                             ('0',
+                              nn.Conv2d(
+                                  inplanes,
+                                  planes * self.expansion,
+                                  1,
+                                  stride=1,
+                                  bias=False)),
+                             ('1', nn.BatchNorm2d(planes * self.expansion))]))
+
+    def forward(self, x: torch.Tensor):
+        identity = x
+
+        out = self.relu1(self.bn1(self.conv1(x)))
+        out = self.relu2(self.bn2(self.conv2(out)))
+        out = self.avgpool(out)
+        out = self.bn3(self.conv3(out))
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu3(out)
+        return out
+
+
+class AttentionPool2d(nn.Module):
+
+    def __init__(self,
+                 spacial_dim: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(
+            torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+
+    def forward(self, x):
+        x = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x[:1],
+            key=x,
+            value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat(
+                [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False)
+        return x.squeeze(0)
+
+
+class ModifiedResNet(nn.Module):
+    """
+    A ResNet class that is similar to torchvision's but contains the following changes:
+    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+    - The final pooling layer is a QKV attention instead of an average pool
+    """
+
+    def __init__(self,
+                 layers,
+                 output_dim,
+                 heads,
+                 input_resolution=224,
+                 width=64):
+        super().__init__()
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+
+        # the 3-layer stem
+        self.conv1 = nn.Conv2d(
+            3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width // 2)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(
+            width // 2, width // 2, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(width // 2)
+        self.relu2 = nn.ReLU(inplace=True)
+        self.conv3 = nn.Conv2d(
+            width // 2, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.relu3 = nn.ReLU(inplace=True)
+        self.avgpool = nn.AvgPool2d(2)
+
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+
+        embed_dim = width * 32  # the ResNet feature dimension
+        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
+                                        heads, output_dim)
+
+    def _make_layer(self, planes, blocks, stride=1):
+        layers = [Bottleneck(self._inplanes, planes, stride)]
+
+        self._inplanes = planes * Bottleneck.expansion
+        for _ in range(1, blocks):
+            layers.append(Bottleneck(self._inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+
+        def stem(x):
+            x = self.relu1(self.bn1(self.conv1(x)))
+            x = self.relu2(self.bn2(self.conv2(x)))
+            x = self.relu3(self.bn3(self.conv3(x)))
+            x = self.avgpool(x)
+            return x
+
+        x = x.type(self.conv1.weight.dtype)
+        x = stem(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.attnpool(x)
+
+        return x
+
+
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+
+
+class QuickGELU(nn.Module):
+
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):
+
+    def __init__(self,
+                 d_model: int,
+                 n_head: int,
+                 attn_mask: torch.Tensor = None):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
+                         ('gelu', QuickGELU()),
+                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(
+            dtype=x.dtype,
+            device=x.device) if self.attn_mask is not None else None
+        return self.attn(
+            x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.attention(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+class Transformer(nn.Module):
+
+    def __init__(self, width, layers, heads, attn_mask=None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[
+            ResidualAttentionBlock(width, heads, attn_mask)
+            for _ in range(layers)
+        ])
+
+    def forward(self, x: torch.Tensor):
+        return self.resblocks(x)
+
+
+class VisionTransformer(nn.Module):
+
+    def __init__(self, input_resolution: int, patch_size: int, width: int,
+                 layers: int, heads: int, output_dim: int):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False)
+
+        scale = width**-0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(
+            (input_resolution // patch_size)**2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+
+        self.transformer = Transformer(width, layers, heads)
+
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+
+    def forward(self, x: torch.Tensor):
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        x1 = self.class_embedding.to(x.dtype)
+        x2 = torch.zeros(
+            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        x = torch.cat([x1 + x2, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        x = self.ln_post(x[:, 0, :])
+
+        if self.proj is not None:
+            x = x @ self.proj
+
+        return x
+
+
+class CLIP(nn.Module):
+
+    def __init__(
+            self,
+            embed_dim: int,
+            # vision
+            image_resolution: int,
+            vision_layers: Union[Tuple[int, int, int, int], int],
+            vision_width: int,
+            vision_patch_size: int,
+            # text
+            context_length: int,
+            vocab_size: int,
+            transformer_width: int,
+            transformer_heads: int,
+            transformer_layers: int):
+        super().__init__()
+
+        self.context_length = context_length
+
+        if isinstance(vision_layers, (tuple, list)):
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(
+                layers=vision_layers,
+                output_dim=embed_dim,
+                heads=vision_heads,
+                input_resolution=image_resolution,
+                width=vision_width)
+        else:
+            vision_heads = vision_width // 64
+            self.visual = VisionTransformer(
+                input_resolution=image_resolution,
+                patch_size=vision_patch_size,
+                width=vision_width,
+                layers=vision_layers,
+                heads=vision_heads,
+                output_dim=embed_dim)
+
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask())
+
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(
+            torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+
+        self.text_projection = nn.Parameter(
+            torch.empty(transformer_width, embed_dim))
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+
+        self.initialize_parameters()
+
+    def initialize_parameters(self):
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.positional_embedding, std=0.01)
+
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features**-0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+
+            for resnet_block in [
+                    self.visual.layer1, self.visual.layer2, self.visual.layer3,
+                    self.visual.layer4
+            ]:
+                for name, param in resnet_block.named_parameters():
+                    if name.endswith('bn3.weight'):
+                        nn.init.zeros_(param)
+
+        proj_std = (self.transformer.width**-0.5) * (
+            (2 * self.transformer.layers)**-0.5)
+        attn_std = self.transformer.width**-0.5
+        fc_std = (2 * self.transformer.width)**-0.5
+        for block in self.transformer.resblocks:
+            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+
+        if self.text_projection is not None:
+            nn.init.normal_(
+                self.text_projection, std=self.transformer.width**-0.5)
+
+    def build_attention_mask(self):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float('-inf'))
+        mask.triu_(1)  # zero out the lower diagonal
+        return mask
+
+    @property
+    def dtype(self):
+        return self.visual.conv1.weight.dtype
+
+    def encode_image(self, image):
+        return self.visual(image.type(self.dtype))
+
+    def encode_text(self, text):
+        x = self.token_embedding(text).type(self.dtype)
+        x = x + self.positional_embedding.type(self.dtype)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x).type(self.dtype)
+        x = x[torch.arange(x.shape[0]),
+              text.argmax(dim=-1)] @ self.text_projection
+        return x
+
+    def forward(self, image, text):
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+
+        # normalized features
+        image_features = image_features / image_features.norm(
+            dim=1, keepdim=True)
+        text_features = text_features / text_features.norm(dim=1, keepdim=True)
+
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+
+        # shape = [global_batch_size, global_batch_size]
+        return logits_per_image, logits_per_text
+
+
+def convert_weights(model: nn.Module):
+    """Convert applicable model parameters to fp16"""
+
+    def _convert_weights_to_fp16(ll):
+        if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+            ll.weight.data = ll.weight.data.half()
+            if ll.bias is not None:
+                ll.bias.data = ll.bias.data.half()
+
+        if isinstance(ll, nn.MultiheadAttention):
+            for attr in [
+                    *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']],
+                    'in_proj_bias', 'bias_k', 'bias_v'
+            ]:
+                tensor = getattr(ll, attr)
+                if tensor is not None:
+                    tensor.data = tensor.data.half()
+
+        for name in ['text_projection', 'proj']:
+            if hasattr(ll, name):
+                attr = getattr(ll, name)
+                if attr is not None:
+                    attr.data = attr.data.half()
+
+    model.apply(_convert_weights_to_fp16)
+
+
+def build_model():
+    model = CLIP(512, 224, 12, 768, 32, 77, 49408, 512, 8, 12)
+    convert_weights(model)
+    return model.eval()
diff --git a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
new file mode 100644
index 00000000..250d680f
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
@@ -0,0 +1,156 @@
+""" CLIP
+Adapted from https://github.com/openai/CLIP.
+Originally MIT License, Copyright (c) 2021 OpenAI.
+"""
+
+import gzip
+import html
+import os
+from functools import lru_cache
+
+import ftfy
+import regex as re
+
+
+@lru_cache()
+def default_bpe():
+    return os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'bpe_simple_vocab_16e6.txt.gz')
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns a dict mapping utf-8 byte values to unique unicode characters.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord('!'),
+                    ord('~') + 1)) + list(range(
+                        ord('¡'),
+                        ord('¬') + 1)) + list(range(ord('®'),
+                                                    ord('ÿ') + 1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text
+
+
+class SimpleTokenizer(object):
+
+    def __init__(self, bpe_path: str = default_bpe()):
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with gzip.open(bpe_path) as f:  # close the vocab archive promptly
+            merges = f.read().decode('utf-8').split('\n')
+        merges = [tuple(m.split()) for m in merges[1:49152 - 256 - 2 + 1]]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v + '</w>' for v in vocab]
+        for merge in merges:
+            vocab.append(''.join(merge))
+        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
+        self.encoder = dict(zip(vocab, range(len(vocab))))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {
+            '<|startoftext|>': '<|startoftext|>',
+            '<|endoftext|>': '<|endoftext|>'
+        }
+        self.pat = re.compile(
+            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+            re.IGNORECASE)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + '</w>', )
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + '</w>'
+
+        while True:
+            bigram = min(
+                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            error_list = []
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except Exception as err:
+                    new_word.extend(word[i:])
+                    error_list.append(err)
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[
+                        i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b]
+                            for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token]
+                              for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode(
+            'utf-8', errors='replace').replace('</w>', ' ')
+        return text
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 7d6cdb59..6fada2b0 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -243,6 +243,13 @@ TASK_OUTPUTS = {
     #    "output_img": np.ndarray with shape [height, width, 3]
     # }
     Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG],
+    # text driven segmentation result for single sample
+    #   {
+    #       "masks": [
+    #           np.array # 2D array containing only 0, 255
+    #       ]
+    #   }
+    Tasks.text_driven_segmentation: [OutputKeys.MASKS],
 
     # movide scene segmentation result for a single video
     # {
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index c9f0c252..40c237c8 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -149,6 +149,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/cv_vitb_video-single-object-tracking_ostrack'),
     Tasks.image_reid_person: (Pipelines.image_reid_person,
                               'damo/cv_passvitb_image-reid-person_market'),
+    Tasks.text_driven_segmentation:
+    (Pipelines.text_driven_segmentation,
+     'damo/cv_vitl16_segmentation_text-driven-seg'),
     Tasks.movie_scene_segmentation:
     (Pipelines.movie_scene_segmentation,
      'damo/cv_resnet50-bert_video-scene-segmentation_movienet')
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index f4e6792b..c8cb0c6a 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -44,6 +44,7 @@ if TYPE_CHECKING:
     from .video_category_pipeline import VideoCategoryPipeline
     from .virtual_try_on_pipeline import VirtualTryonPipeline
     from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline
+    from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline
     from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
 
 else:
@@ -97,6 +98,8 @@ else:
         'virtual_try_on_pipeline': ['VirtualTryonPipeline'],
         'easycv_pipeline':
         ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'],
+        'text_driven_segmentation_pipleline':
+        ['TextDrivenSegmentationPipeline'],
         'movie_scene_segmentation_pipeline':
         ['MovieSceneSegmentationPipeline'],
     }
diff --git a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py
new file mode 100644
index 00000000..0985b835
--- /dev/null
+++ b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py
@@ -0,0 +1,51 @@
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+
+
+@PIPELINES.register_module(
+    Tasks.text_driven_segmentation,
+    module_name=Pipelines.text_driven_segmentation)
+class TextDrivenSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, auto_collate=False, **kwargs)
+
+    def preprocess(self, input: Dict) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input['image'])
+        img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img)
+        result = {
+            'img': img_tensor,
+            'ori_h': ori_h,
+            'ori_w': ori_w,
+            'crop_h': crop_h,
+            'crop_w': crop_w,
+            'text': input['text'],
+        }
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        outputs = self.model.inference(input['img'], input['text'])
+        result = {
+            'data': outputs,
+            'ori_h': input['ori_h'],
+            'ori_w': input['ori_w'],
+            'crop_h': input['crop_h'],
+            'crop_w': input['crop_w'],
+        }
+        return result
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        data = self.model.postprocess(inputs['data'], inputs['crop_h'],
+                                      inputs['crop_w'], inputs['ori_h'],
+                                      inputs['ori_w'])
+        outputs = {OutputKeys.MASKS: data}
+        return outputs
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 2265ef5a..ed1ec798 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -36,6 +36,7 @@ class CVTasks(object):
 
     image_segmentation = 'image-segmentation'
     portrait_matting = 'portrait-matting'
+    text_driven_segmentation = 'text-driven-segmentation'
 
     # image editing
     skin_retouching = 'skin-retouching'
diff --git a/tests/pipelines/test_text_driven_segmentation.py b/tests/pipelines/test_text_driven_segmentation.py
new file mode 100644
index 00000000..741787d9
--- /dev/null
+++ b/tests/pipelines/test_text_driven_segmentation.py
@@ -0,0 +1,28 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class TextDrivenSegmentationTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_text_driven_segmentation(self):
+        input_location = 'data/test/images/text_driven_segmentation.jpg'
+        test_input = {
+            'image': input_location,
+            'text': 'bear',
+        }
+        model_id = 'damo/cv_vitl16_segmentation_text-driven-seg'
+        shop_seg = pipeline(Tasks.text_driven_segmentation, model=model_id)
+        result = shop_seg(test_input)
+        import cv2
+        # result[OutputKeys.MASKS] is the segment map; other keys are unused
+        cv2.imwrite(input_location + '_lseg.jpg', result[OutputKeys.MASKS])
+
+
+if __name__ == '__main__':
+    unittest.main()

From 5a2634610a3e1efca692327ab31988313574156d Mon Sep 17 00:00:00 2001
From: "suluyan.sly" <suluyan.sly@alibaba-inc.com>
Date: Fri, 2 Sep 2022 20:03:19 +0800
Subject: [PATCH 044/175] [to #42322933]skip sbert_en&bert_ch to save ci time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

![](https://cn-hangzhou.oss-cdn.aliyun-inc.com/git/force/uploads/comment/251924/40165669611078357/image.png)
fill mask pipeline 测试时间过长

这个task测了4个模型。从保证代码正确性的功能角度看，只测一个bert类(比如sbert中文），一个roberta类（veco)。减少测试的模型数量以减少测试时长。
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10006556

    * skip sbert_en&bert_ch to save ci time
---
 tests/pipelines/test_fill_mask.py | 38 ++-----------------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py
index 1b709e27..6b37f6df 100644
--- a/tests/pipelines/test_fill_mask.py
+++ b/tests/pipelines/test_fill_mask.py
@@ -43,7 +43,7 @@ class FillMaskTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
         # sbert
-        for language in ['zh', 'en']:
+        for language in ['zh']:
             model_dir = snapshot_download(self.model_id_sbert[language])
             preprocessor = FillMaskPreprocessor(
                 model_dir, first_sequence='sentence', second_sequence=None)
@@ -74,24 +74,10 @@ class FillMaskTest(unittest.TestCase):
                 f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n'
             )
 
-        # zh bert
-        language = 'zh'
-        model_dir = snapshot_download(self.model_id_bert)
-        preprocessor = FillMaskPreprocessor(
-            model_dir, first_sequence='sentence', second_sequence=None)
-        model = BertForMaskedLM.from_pretrained(model_dir)
-        pipeline1 = FillMaskPipeline(model, preprocessor)
-        pipeline2 = pipeline(
-            Tasks.fill_mask, model=model, preprocessor=preprocessor)
-        ori_text = self.ori_texts[language]
-        test_input = self.test_inputs[language]
-        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: '
-              f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n')
-
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         # sbert
-        for language in ['zh', 'en']:
+        for language in ['zh']:
             print(self.model_id_sbert[language])
             model = Model.from_pretrained(self.model_id_sbert[language])
             preprocessor = FillMaskPreprocessor(
@@ -121,20 +107,6 @@ class FillMaskTest(unittest.TestCase):
                     f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
                     f'{pipeline_ins(test_input)}\n')
 
-        # zh bert
-        model = Model.from_pretrained(self.model_id_bert)
-        preprocessor = FillMaskPreprocessor(
-            model.model_dir, first_sequence='sentence', second_sequence=None)
-        pipeline_ins = pipeline(
-            Tasks.fill_mask, model=model, preprocessor=preprocessor)
-        language = 'zh'
-        ori_text = self.ori_texts[language]
-        test_input = self.test_inputs[language]
-        with self.regress_tool.monitor_module_single_forward(
-                pipeline_ins.model, 'fill_mask_bert_zh'):
-            print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
-                  f'{pipeline_ins(test_input)}\n')
-
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         # veco
@@ -153,12 +125,6 @@ class FillMaskTest(unittest.TestCase):
             f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
             f'{pipeline_ins(self.test_inputs[language])}\n')
 
-        # bert
-        pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_bert)
-        print(
-            f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
-            f'{pipeline_ins(self.test_inputs[language])}\n')
-
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.fill_mask)

From 4073376f512af16fb62814bade1482d2deb55236 Mon Sep 17 00:00:00 2001
From: "shouzhou.bx" <shouzhou.bx@alibaba-inc.com>
Date: Fri, 2 Sep 2022 20:53:29 +0800
Subject: [PATCH 045/175] [to #42322933]add face 2d keypoints by EasyCV        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9934673

    * add face 2d keypoints
---
 .../test_img_face_2d_keypoints.png            |  3 ++
 modelscope/metainfo.py                        |  3 ++
 modelscope/models/cv/__init__.py              |  6 +--
 .../models/cv/face_2d_keypoints/__init__.py   | 20 +++++++++
 .../face_2d_keypoints_align.py                | 16 ++++++++
 .../cv/face_2d_keypoins/__init__.py           | 20 +++++++++
 .../face_2d_keypoints_dataset.py              | 13 ++++++
 modelscope/outputs.py                         |  9 ++++
 modelscope/pipelines/builder.py               |  2 +
 modelscope/pipelines/cv/__init__.py           |  8 ++--
 .../pipelines/cv/easycv_pipelines/__init__.py |  4 +-
 .../face_2d_keypoints_pipeline.py             | 41 +++++++++++++++++++
 modelscope/utils/constant.py                  |  1 +
 tests/pipelines/test_face_2d_keypoints.py     | 36 ++++++++++++++++
 14 files changed, 175 insertions(+), 7 deletions(-)
 create mode 100644 data/test/images/keypoints_detect/test_img_face_2d_keypoints.png
 create mode 100644 modelscope/models/cv/face_2d_keypoints/__init__.py
 create mode 100644 modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py
 create mode 100644 modelscope/msdatasets/cv/face_2d_keypoins/__init__.py
 create mode 100644 modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
 create mode 100644 modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
 create mode 100644 tests/pipelines/test_face_2d_keypoints.py

diff --git a/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png b/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png
new file mode 100644
index 00000000..00311c33
--- /dev/null
+++ b/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:331ead75033fa2f01f6be72a2f8e34d581fcb593308067815d4bb136bb13b766
+size 54390
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 3225710a..06b5a476 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -24,6 +24,7 @@ class Models(object):
     body_2d_keypoints = 'body-2d-keypoints'
     body_3d_keypoints = 'body-3d-keypoints'
     crowd_counting = 'HRNetCrowdCounting'
+    face_2d_keypoints = 'face-2d-keypoints'
     panoptic_segmentation = 'swinL-panoptic-segmentation'
     image_reid_person = 'passvitb'
     video_summarization = 'pgl-video-summarization'
@@ -112,6 +113,7 @@ class Pipelines(object):
     object_detection = 'vit-object-detection'
     easycv_detection = 'easycv-detection'
     easycv_segmentation = 'easycv-segmentation'
+    face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
     salient_detection = 'u2net-salient-detection'
     image_classification = 'image-classification'
     face_detection = 'resnet-face-detection-scrfd10gkps'
@@ -353,6 +355,7 @@ class Datasets(object):
     """ Names for different datasets.
     """
     ClsDataset = 'ClsDataset'
+    Face2dKeypointsDataset = 'Face2dKeypointsDataset'
     SegDataset = 'SegDataset'
     DetDataset = 'DetDataset'
     DetImagesMixDataset = 'DetImagesMixDataset'
diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py
index 331f23bd..4db43d17 100644
--- a/modelscope/models/cv/__init__.py
+++ b/modelscope/models/cv/__init__.py
@@ -3,9 +3,9 @@
 # yapf: disable
 from . import (action_recognition, animal_recognition, body_2d_keypoints,
                body_3d_keypoints, cartoon, cmdssl_video_embedding,
-               crowd_counting, face_detection, face_generation,
-               image_classification, image_color_enhance, image_colorization,
-               image_denoise, image_instance_segmentation,
+               crowd_counting, face_2d_keypoints, face_detection,
+               face_generation, image_classification, image_color_enhance,
+               image_colorization, image_denoise, image_instance_segmentation,
                image_panoptic_segmentation, image_portrait_enhancement,
                image_reid_person, image_semantic_segmentation,
                image_to_image_generation, image_to_image_translation,
diff --git a/modelscope/models/cv/face_2d_keypoints/__init__.py b/modelscope/models/cv/face_2d_keypoints/__init__.py
new file mode 100644
index 00000000..636ba0f4
--- /dev/null
+++ b/modelscope/models/cv/face_2d_keypoints/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .face_2d_keypoints_align import Face2DKeypoints
+
+else:
+    _import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py b/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py
new file mode 100644
index 00000000..468662a0
--- /dev/null
+++ b/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.face.face_keypoint import FaceKeypoint
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.easycv_base import EasyCVBaseModel
+from modelscope.utils.constant import Tasks
+
+
+@MODELS.register_module(
+    group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
+class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):
+    """EasyCV FaceKeypoint model registered in MODELS for face 2d keypoints."""
+    def __init__(self, model_dir=None, *args, **kwargs):
+        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
+        FaceKeypoint.__init__(self, *args, **kwargs)
diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py b/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py
new file mode 100644
index 00000000..e9d76b7e
--- /dev/null
+++ b/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .face_2d_keypoints_dataset import FaceKeypointDataset
+
+else:
+    _import_structure = {'face_2d_keypoints_dataset': ['FaceKeypointDataset']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
new file mode 100644
index 00000000..a902999d
--- /dev/null
+++ b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
@@ -0,0 +1,13 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset
+
+from modelscope.metainfo import Datasets
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.face_2d_keypoints,
+    module_name=Datasets.Face2dKeypointsDataset)
+class FaceKeypointDataset(_FaceKeypointDataset):
+    """EasyCV FaceKeypointDataset subclass registered in TASK_DATASETS."""
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 6fada2b0..e84c8dcc 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -57,6 +57,15 @@ TASK_OUTPUTS = {
     # }
     Tasks.ocr_recognition: [OutputKeys.TEXT],
 
+    # face 2d keypoint result for single sample
+    #   {
+    #       "keypoints": [
+    #           [x1, y1]*106
+    #       ],
+    #       "poses": [pitch, roll, yaw]
+    #   }
+    Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES],
+
     # face detection result for single sample
     #   {
     #       "scores": [0.9, 0.1, 0.05, 0.05]
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 40c237c8..f43d152b 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -103,6 +103,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                            'damo/cv_resnet_facedetection_scrfd10gkps'),
     Tasks.face_recognition: (Pipelines.face_recognition,
                              'damo/cv_ir101_facerecognition_cfglint'),
+    Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints,
+                              'damo/cv_mobilenet_face-2d-keypoints_alignment'),
     Tasks.video_multi_modal_embedding:
     (Pipelines.video_multi_modal_embedding,
      'damo/multi_modal_clip_vtretrival_msrvtt_53'),
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index c8cb0c6a..9e7d80ee 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -43,7 +43,7 @@ if TYPE_CHECKING:
     from .tinynas_classification_pipeline import TinynasClassificationPipeline
     from .video_category_pipeline import VideoCategoryPipeline
     from .virtual_try_on_pipeline import VirtualTryonPipeline
-    from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline
+    from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline
     from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline
     from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
 
@@ -96,8 +96,10 @@ else:
         'tinynas_classification_pipeline': ['TinynasClassificationPipeline'],
         'video_category_pipeline': ['VideoCategoryPipeline'],
         'virtual_try_on_pipeline': ['VirtualTryonPipeline'],
-        'easycv_pipeline':
-        ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'],
+        'easycv_pipeline': [
+            'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline',
+            'Face2DKeypointsPipeline'
+        ],
         'text_driven_segmentation_pipeline':
         ['TextDrivenSegmentationPipeline'],
         'movie_scene_segmentation_pipeline':
diff --git a/modelscope/pipelines/cv/easycv_pipelines/__init__.py b/modelscope/pipelines/cv/easycv_pipelines/__init__.py
index 0984ff43..4f149130 100644
--- a/modelscope/pipelines/cv/easycv_pipelines/__init__.py
+++ b/modelscope/pipelines/cv/easycv_pipelines/__init__.py
@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .detection_pipeline import EasyCVDetectionPipeline
     from .segmentation_pipeline import EasyCVSegmentationPipeline
+    from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline
 else:
     _import_structure = {
         'detection_pipeline': ['EasyCVDetectionPipeline'],
-        'segmentation_pipeline': ['EasyCVSegmentationPipeline']
+        'segmentation_pipeline': ['EasyCVSegmentationPipeline'],
+        'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline']
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
new file mode 100644
index 00000000..eb4d6c15
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
@@ -0,0 +1,41 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from .base import EasyCVPipeline
+
+
+@PIPELINES.register_module(
+    Tasks.face_2d_keypoints, module_name=Pipelines.face_2d_keypoints)
+class Face2DKeypointsPipeline(EasyCVPipeline):
+    """EasyCV-backed pipeline that predicts face 2d keypoints and poses."""
+
+    def __init__(self,
+                 model: str,
+                 model_file_pattern=ModelFile.TORCH_MODEL_FILE,
+                 *args,
+                 **kwargs):
+        """Pass the model location and any extra arguments to the base class.
+
+        Args:
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): model file pattern.
+        """
+        super().__init__(
+            model=model, model_file_pattern=model_file_pattern, *args,
+            **kwargs)
+
+    def show_result(self, img, points, scale=2, save_path=None):
+        """Forward the arguments to ``self.predict_op.show_result``."""
+        return self.predict_op.show_result(img, points, scale, save_path)
+
+    def __call__(self, inputs) -> Any:
+        """Run ``predict_op`` and return the keypoints and poses."""
+        # Only element [0][0] of the predictor output is used.
+        first = self.predict_op(inputs)[0][0]
+        return {OutputKeys.KEYPOINTS: first['point'],
+                OutputKeys.POSES: first['pose']}
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index ed1ec798..86808ea1 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -20,6 +20,7 @@ class CVTasks(object):
     animal_recognition = 'animal-recognition'
     face_detection = 'face-detection'
     face_recognition = 'face-recognition'
+    face_2d_keypoints = 'face-2d-keypoints'
     human_detection = 'human-detection'
     human_object_interaction = 'human-object-interaction'
     face_image_generation = 'face-image-generation'
diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py
new file mode 100644
index 00000000..a5e347e8
--- /dev/null
+++ b/tests/pipelines/test_face_2d_keypoints.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import cv2
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_face_2d_keypoints(self):
+        img_path = 'data/test/images/keypoints_detect/test_img_face_2d_keypoints.png'
+        model_id = 'damo/cv_mobilenet_face-2d-keypoints_alignment'
+
+        face_2d_keypoints_align = pipeline(
+            task=Tasks.face_2d_keypoints, model=model_id)
+        output = face_2d_keypoints_align(img_path)
+
+        output_keypoints = output[OutputKeys.KEYPOINTS]
+        output_pose = output[OutputKeys.POSES]
+        # show_result is given the raw image and the predicted keypoints.
+        img = cv2.imread(img_path)
+        img = face_2d_keypoints_align.show_result(
+            img, output_keypoints, scale=2, save_path='face_keypoints.jpg')
+        # Expect 106 two-column keypoints and a pose with 3 entries.
+        self.assertEqual(output_keypoints.shape[0], 106)
+        self.assertEqual(output_keypoints.shape[1], 2)
+        self.assertEqual(output_pose.shape[0], 3)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 00487aa6e1ca1b7ac50b5ca90b3290f2a6068d77 Mon Sep 17 00:00:00 2001
From: "xixing.tj" <xixing.tj@alibaba-inc.com>
Date: Sat, 3 Sep 2022 11:38:07 +0800
Subject: [PATCH 046/175] [to #42322933]add error msg when no text detected for
 ocr_detection task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ocr_detection加上当图片中没有文字时报错的error msg
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10001490
---
 modelscope/pipelines/cv/ocr_detection_pipeline.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py
index 62248714..b73f65a4 100644
--- a/modelscope/pipelines/cv/ocr_detection_pipeline.py
+++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py
@@ -149,6 +149,8 @@ class OCRDetectionPipeline(Pipeline):
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         rboxes = inputs['combined_rboxes'][0]
         count = inputs['combined_counts'][0]
+        if count == 0 or count < rboxes.shape[0]:
+            raise Exception('modelscope error: No text detected')
         rboxes = rboxes[:count, :]
 
         # convert rboxes to polygons and find its coordinates on the original image

From 4f72134adf6f6154e5eb02602b33f2066426dbe4 Mon Sep 17 00:00:00 2001
From: "shuying.shu" <shuying.shu@alibaba-inc.com>
Date: Sat, 3 Sep 2022 11:50:01 +0800
Subject: [PATCH 047/175] [to #42322933]update test video for movie scene
 segmentation         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10007852

    * update test video for movie scene segmentation
---
 data/test/videos/movie_scene_segmentation_test_video.mp4 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/test/videos/movie_scene_segmentation_test_video.mp4 b/data/test/videos/movie_scene_segmentation_test_video.mp4
index ee6ed528..21ea3cb1 100644
--- a/data/test/videos/movie_scene_segmentation_test_video.mp4
+++ b/data/test/videos/movie_scene_segmentation_test_video.mp4
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f
-size 126815483
+oid sha256:03002807dc2aa180c3ae104e764c7a4d6c421d186a5d552f97d338467ae6c443
+size 12722029

From ba74cdf97e8944e724b78cdfaf43f2de0fed721b Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Sat, 3 Sep 2022 12:10:16 +0800
Subject: [PATCH 048/175] [to #43878347] Rename runtime.txt  to framework.txt  
       Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10000642

    * rename runtime.txt  to framework.txt
---
 .readthedocs.yaml                           | 2 +-
 docker/Dockerfile.ubuntu                    | 2 +-
 requirements.txt                            | 2 +-
 requirements/{runtime.txt => framework.txt} | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename requirements/{runtime.txt => framework.txt} (100%)

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index b88d734a..f7b9c7ea 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -25,4 +25,4 @@ python:
   install:
     - requirements: requirements/docs.txt
     - requirements: requirements/readthedocs.txt
-    - requirements: requirements/runtime.txt
+    - requirements: requirements/framework.txt
diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index 97881007..78da0b6f 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -64,7 +64,7 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
 # install modelscope
 COPY requirements /var/modelscope
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
+    pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
diff --git a/requirements.txt b/requirements.txt
index c6e294ba..0832e6ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
--r requirements/runtime.txt
+-r requirements/framework.txt
diff --git a/requirements/runtime.txt b/requirements/framework.txt
similarity index 100%
rename from requirements/runtime.txt
rename to requirements/framework.txt

From 39a309b6554070e68741a36593211ab47910a293 Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingda.chen@alibaba-inc.com>
Date: Sat, 3 Sep 2022 12:18:29 +0800
Subject: [PATCH 049/175]  [to #42322933] reduce train epoch from 3 to 2

---
 tests/trainers/test_finetune_mplug.py               | 2 +-
 tests/trainers/test_finetune_token_classificatin.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index 351600c6..b46dbf45 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -35,7 +35,7 @@ class TestFinetuneMPlug(unittest.TestCase):
             }).rename_column('image:FILE',
                              'image').rename_column('answer:Value', 'answer'))
 
-        self.max_epochs = 3
+        self.max_epochs = 2
 
     def tearDown(self):
         shutil.rmtree(self.tmp_dir)
diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py
index c34410be..9bdab9b7 100644
--- a/tests/trainers/test_finetune_token_classificatin.py
+++ b/tests/trainers/test_finetune_token_classificatin.py
@@ -92,7 +92,7 @@ class TestFinetuneTokenClassification(unittest.TestCase):
                 }
             }
             cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
-            cfg.train.max_epochs = 3
+            cfg.train.max_epochs = 2
             cfg.train.lr_scheduler = {
                 'type': 'LinearLR',
                 'start_factor': 1.0,

From 04516276265f27996b2ffb293f3ef6315055d0d7 Mon Sep 17 00:00:00 2001
From: "xingguang.zxg" <xingguang.zxg@alibaba-inc.com>
Date: Sat, 3 Sep 2022 13:21:31 +0800
Subject: [PATCH 050/175] =?UTF-8?q?[to=20#42322933]=E5=95=86=E5=93=81?=
 =?UTF-8?q?=E6=98=BE=E8=91=97=E6=80=A7=E5=88=86=E5=89=B2v1.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

商品显著性检测模型，依赖opencv，mmcv-full
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9909897
---
 data/test/images/shop_segmentation.jpg        |   3 +
 modelscope/metainfo.py                        |   2 +
 modelscope/models/cv/__init__.py              |   2 +-
 .../models/cv/shop_segmentation/__init__.py   |   1 +
 .../models/cv/shop_segmentation/common.py     |  59 ++
 .../models/cv/shop_segmentation/head_fpn.py   | 122 +++
 .../models/cv/shop_segmentation/models.py     | 901 ++++++++++++++++++
 .../models/cv/shop_segmentation/neck_fpn.py   | 217 +++++
 .../cv/shop_segmentation/shop_seg_base.py     | 157 +++
 .../cv/shop_segmentation/shop_seg_model.py    | 115 +++
 .../models/cv/shop_segmentation/utils.py      | 199 ++++
 modelscope/outputs.py                         |   8 +-
 modelscope/pipelines/builder.py               |   4 +-
 modelscope/pipelines/cv/__init__.py           |   3 +-
 .../cv/shop_segmentation_pipleline.py         |  51 +
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_shop_segmentation.py     |  24 +
 17 files changed, 1865 insertions(+), 4 deletions(-)
 create mode 100644 data/test/images/shop_segmentation.jpg
 create mode 100644 modelscope/models/cv/shop_segmentation/__init__.py
 create mode 100644 modelscope/models/cv/shop_segmentation/common.py
 create mode 100644 modelscope/models/cv/shop_segmentation/head_fpn.py
 create mode 100644 modelscope/models/cv/shop_segmentation/models.py
 create mode 100644 modelscope/models/cv/shop_segmentation/neck_fpn.py
 create mode 100644 modelscope/models/cv/shop_segmentation/shop_seg_base.py
 create mode 100644 modelscope/models/cv/shop_segmentation/shop_seg_model.py
 create mode 100644 modelscope/models/cv/shop_segmentation/utils.py
 create mode 100644 modelscope/pipelines/cv/shop_segmentation_pipleline.py
 create mode 100644 tests/pipelines/test_shop_segmentation.py

diff --git a/data/test/images/shop_segmentation.jpg b/data/test/images/shop_segmentation.jpg
new file mode 100644
index 00000000..ec02881d
--- /dev/null
+++ b/data/test/images/shop_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5ecc371c8b0ca09d0e11df89bc549000937eafc451929586426fe657ade25a0
+size 238607
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 06b5a476..b1bf9600 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -32,6 +32,7 @@ class Models(object):
     vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
     text_driven_segmentation = 'text-driven-segmentation'
     resnet50_bert = 'resnet50-bert'
+    shop_segmentation = 'shop-segmentation'
 
     # EasyCV models
     yolox = 'YOLOX'
@@ -148,6 +149,7 @@ class Pipelines(object):
     image_reid_person = 'passvitb-image-reid-person'
     text_driven_segmentation = 'text-driven-segmentation'
     movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
+    shop_segmentation = 'shop-segmentation'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py
index 4db43d17..f2798b59 100644
--- a/modelscope/models/cv/__init__.py
+++ b/modelscope/models/cv/__init__.py
@@ -11,7 +11,7 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints,
                image_to_image_generation, image_to_image_translation,
                movie_scene_segmentation, object_detection,
                product_retrieval_embedding, realtime_object_detection,
-               salient_detection, super_resolution,
+               salient_detection, shop_segmentation, super_resolution,
                video_single_object_tracking, video_summarization, virual_tryon)
 
 # yapf: enable
diff --git a/modelscope/models/cv/shop_segmentation/__init__.py b/modelscope/models/cv/shop_segmentation/__init__.py
new file mode 100644
index 00000000..b40a0760
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/__init__.py
@@ -0,0 +1 @@
+from .shop_seg_base import SHOPSEG
diff --git a/modelscope/models/cv/shop_segmentation/common.py b/modelscope/models/cv/shop_segmentation/common.py
new file mode 100644
index 00000000..00ba9996
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/common.py
@@ -0,0 +1,59 @@
+"""
+Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+https://github.com/open-mmlab/mmsegmentation/,
+originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+and adapted from https://github.com/raoyongming/DenseCLIP/,
+originally MIT License, Copyright (c) 2022 Rao, Yongming.
+"""
+
+import warnings
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def resize(input,
+           size=None,
+           scale_factor=None,
+           mode='nearest',
+           align_corners=None,
+           warning=True):
+    """Wrap ``F.interpolate``; optionally warn on misaligned upsampling."""
+    if warning and size is not None and align_corners:
+        in_h, in_w = (int(d) for d in input.shape[2:])
+        out_h, out_w = (int(d) for d in size)
+        grows = out_h > in_h or out_w > in_w
+        non_trivial = out_h > 1 and out_w > 1 and in_h > 1 and in_w > 1
+        if (grows and non_trivial and (out_h - 1) % (in_h - 1)
+                and (out_w - 1) % (in_w - 1)):
+            warnings.warn(
+                f'When align_corners={align_corners}, '
+                'the output would more aligned if '
+                f'input size {(in_h, in_w)} is `x+1` and '
+                f'out size {(out_h, out_w)} is `nx+1`')
+    return F.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+class Upsample(nn.Module):
+    """Module form of ``resize``: a truthy ``size`` wins, otherwise the last
+    two input dims are multiplied by ``scale_factor`` (one number for both
+    axes, or a tuple giving one factor per axis)."""
+
+    def __init__(self, size=None, scale_factor=None, mode='nearest',
+                 align_corners=None):
+        super(Upsample, self).__init__()
+        self.size = size
+        if isinstance(scale_factor, tuple):
+            self.scale_factor = tuple(float(factor) for factor in scale_factor)
+        else:
+            self.scale_factor = float(scale_factor) if scale_factor else None
+        self.mode = mode
+        self.align_corners = align_corners
+
+    def forward(self, x):
+        # A tuple factor is paired per axis (int * tuple would just repeat it).
+        sf = self.scale_factor
+        sf = sf if isinstance(sf, tuple) else (sf, sf)
+        size = self.size or [int(t * f) for t, f in zip(x.shape[-2:], sf)]
+        return resize(x, size, None, self.mode, self.align_corners)
diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py
new file mode 100644
index 00000000..b3faa9b8
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/head_fpn.py
@@ -0,0 +1,122 @@
+""" FPNHead
+Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+https://github.com/open-mmlab/mmsegmentation/,
+originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+and adapted from https://github.com/raoyongming/DenseCLIP/,
+originally MIT License, Copyright (c) 2022 Rao, Yongming.
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from timm.models.layers import drop, drop_path, trunc_normal_
+
+from .common import Upsample, resize
+
+
+class FPNHead(nn.Module):
+    """Semantic FPN decode head (https://arxiv.org/abs/1901.02446).
+
+    Args:
+        channels (int): Channels of every input level and of the head.
+        num_classes (int): Output channels of the final 1x1 conv.
+        dropout_ratio (float): Dropout2d ratio; no dropout when <= 0.
+        feature_strides (tuple[int]): Strides of the input feature maps,
+            expected to be powers of 2 with the smallest stride first.
+        align_corners (bool): Passed to every bilinear resize.
+    """
+
+    def __init__(self,
+                 channels,
+                 num_classes,
+                 dropout_ratio=0.1,
+                 feature_strides=(4, 8, 16, 32),
+                 align_corners=False,
+                 **kwargs):
+        super(FPNHead, self).__init__()
+        self.act_cfg = dict(type='ReLU')
+        self.channels = channels
+        self.conv_cfg = None
+        self.norm_cfg = dict(type='BN2d', requires_grad=True)
+        self.align_corners = align_corners
+        self.dropout_ratio = dropout_ratio
+        self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
+        if dropout_ratio > 0:
+            self.dropout = nn.Dropout2d(dropout_ratio)
+        else:
+            self.dropout = None
+        # One input level per stride, so heads and selected inputs always match.
+        self.in_index = list(range(len(feature_strides)))
+        assert min(feature_strides) == feature_strides[0]
+        self.feature_strides = feature_strides
+        self.scale_heads = nn.ModuleList()
+        for stride in feature_strides:
+            # One conv per doubling above the first stride (at least one).
+            head_length = max(
+                1, int(np.log2(stride) - np.log2(feature_strides[0])))
+            scale_head = []
+            for _ in range(head_length):
+                scale_head.append(
+                    ConvModule(
+                        self.channels,
+                        self.channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                if stride != feature_strides[0]:
+                    scale_head.append(
+                        Upsample(
+                            scale_factor=2,
+                            mode='bilinear',
+                            align_corners=self.align_corners))
+            self.scale_heads.append(nn.Sequential(*scale_head))
+
+        self.apply(self._init_weights)
+
+    def _transform_inputs(self, inputs):
+        """Select the feature levels used by the decoder.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+
+        Returns:
+            list[Tensor]: ``inputs[i]`` for every ``i`` in ``self.in_index``.
+        """
+        return [inputs[i] for i in self.in_index]
+
+    def cls_seg(self, feat):
+        """Classify each pixel."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        return self.conv_seg(feat)
+
+    def forward(self, inputs):
+        """Sum all levels at the first level's size, then classify."""
+        x = self._transform_inputs(inputs)
+        output = self.scale_heads[0](x[0])
+        for i in range(1, len(self.feature_strides)):
+            # non inplace
+            output = output + resize(
+                self.scale_heads[i](x[i]),
+                size=output.shape[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+        return self.cls_seg(output)
+
+    def _init_weights(self, m):
+        """Per-module initializer used with ``self.apply``."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
+            if m.bias is not None:
+                nn.init.constant_(m.bias.data, 0)
diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py
new file mode 100644
index 00000000..8b82d1d1
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/models.py
@@ -0,0 +1,901 @@
+"""
+Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+https://github.com/open-mmlab/mmsegmentation/,
+originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+and adapted from https://github.com/raoyongming/DenseCLIP/,
+originally MIT License, Copyright (c) 2022 Rao, Yongming.
+"""
+
+import math
+from collections import OrderedDict
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import drop, drop_path, trunc_normal_
+from torch import nn
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+
+        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+
+        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = None
+        self.stride = stride
+
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
+            self.downsample = nn.Sequential(
+                OrderedDict([('-1', nn.AvgPool2d(stride)),
+                             ('0',
+                              nn.Conv2d(
+                                  inplanes,
+                                  planes * self.expansion,
+                                  1,
+                                  stride=1,
+                                  bias=False)),
+                             ('1', nn.BatchNorm2d(planes * self.expansion))]))
+
+    def forward(self, x: torch.Tensor):
+        identity = x
+
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.avgpool(out)  # nn.Identity unless stride > 1
+        out = self.bn3(self.conv3(out))
+
+        if self.downsample is not None:
+            identity = self.downsample(x)  # avgpool + 1x1 conv + bn on skip
+
+        out += identity  # residual add, in place on the bn3 output
+        out = self.relu(out)
+        return out
+
+
+class AttentionPool2d(nn.Module):
+
+    def __init__(self,
+                 spacial_dim: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(
+            torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+        self.embed_dim = embed_dim
+        self.spacial_dim = spacial_dim
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        x = x.reshape(x.shape[0], x.shape[1],
+                      x.shape[2] * x.shape[3]).permute(2, 0,
+                                                       1)  # NCHW -> (HW)NC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+
+        cls_pos = self.positional_embedding[0:1, :]
+        spatial_pos = F.interpolate(
+            self.positional_embedding[1:, ].reshape(1, self.spacial_dim,
+                                                    self.spacial_dim,
+                                                    self.embed_dim).permute(
+                                                        0, 3, 1, 2),
+            size=(H, W),
+            mode='bilinear')
+        spatial_pos = spatial_pos.reshape(self.embed_dim, H * W).permute(1, 0)
+        positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0)
+
+        x = x + positional_embedding[:, None, :]
+        x, _ = F.multi_head_attention_forward(
+            query=x,
+            key=x,
+            value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat(
+                [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False)
+
+        x = x.permute(1, 2, 0)
+        global_feat = x[:, :, 0]
+        feature_map = x[:, :, 1:].reshape(B, -1, H, W)
+        return global_feat, feature_map
+
+
+class CLIPResNet(nn.Module):
+    """
+    A ResNet class that is similar to torchvision's but contains the following changes:
+    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+    - No final pooling layer: forward() returns the layer1-layer4 feature maps
+    """
+
+    def __init__(self,
+                 layers,
+                 output_dim=512,
+                 input_resolution=224,
+                 width=64,
+                 pretrained=None,
+                 **kwargs):
+        super().__init__()
+        self.pretrained = pretrained
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+
+        # the 3-layer stem
+        self.conv1 = nn.Conv2d(
+            3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width // 2)
+        self.conv2 = nn.Conv2d(
+            width // 2, width // 2, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(width // 2)
+        self.conv3 = nn.Conv2d(
+            width // 2, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.avgpool = nn.AvgPool2d(2)
+        self.relu = nn.ReLU(inplace=True)
+
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+
+    def init_weights(self, pretrained=None):
+        pretrained = pretrained or self.pretrained
+        if isinstance(pretrained, str):
+            checkpoint = torch.jit.load(
+                pretrained, map_location='cpu').float().state_dict()
+
+            state_dict = {}
+
+            for k in checkpoint.keys():
+                if k.startswith('visual.'):
+                    new_k = k.replace('visual.', '')
+                    state_dict[new_k] = checkpoint[k]
+
+            u, w = self.load_state_dict(state_dict, False)
+            print(u, w, 'are misaligned params in CLIPResNet')
+
+    def _make_layer(self, planes, blocks, stride=1):
+        layers = [Bottleneck(self._inplanes, planes, stride)]
+
+        self._inplanes = planes * Bottleneck.expansion
+        for _ in range(1, blocks):
+            layers.append(Bottleneck(self._inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+
+        def stem(x):
+            for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
+                             (self.conv3, self.bn3)]:
+                x = self.relu(bn(conv(x)))
+            x = self.avgpool(x)
+            return x
+
+        x = x.type(self.conv1.weight.dtype)
+        x = stem(x)
+
+        outs = []
+        x = self.layer1(x)
+        outs.append(x)
+        x = self.layer2(x)
+        outs.append(x)
+        x = self.layer3(x)
+        outs.append(x)
+        x = self.layer4(x)
+        outs.append(x)
+
+        return tuple(outs)
+
+
+class CLIPResNetWithAttention(nn.Module):
+    """
+    A ResNet class that is similar to torchvision's but contains the following changes:
+    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+    - The final pooling layer is a QKV attention instead of an average pool
+    """
+
+    def __init__(self,
+                 layers,
+                 output_dim=1024,
+                 input_resolution=224,
+                 width=64,
+                 pretrained=None,
+                 **kwargs):
+        super().__init__()
+        self.pretrained = pretrained
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+
+        # the 3-layer stem
+        self.conv1 = nn.Conv2d(
+            3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width // 2)
+        self.conv2 = nn.Conv2d(
+            width // 2, width // 2, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(width // 2)
+        self.conv3 = nn.Conv2d(
+            width // 2, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.avgpool = nn.AvgPool2d(2)
+        self.relu = nn.ReLU(inplace=True)
+
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+
+        embed_dim = width * 32  # the ResNet feature dimension
+        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32,
+                                        output_dim)
+
+    def init_weights(self, pretrained=None):
+        pretrained = pretrained or self.pretrained
+        if isinstance(pretrained, str):
+            checkpoint = torch.jit.load(
+                pretrained, map_location='cpu').float().state_dict()
+
+            state_dict = {}
+
+            for k in checkpoint.keys():
+                if k.startswith('visual.'):
+                    new_k = k.replace('visual.', '')
+                    state_dict[new_k] = checkpoint[k]
+
+                    if 'positional_embedding' in new_k:
+                        if self.attnpool.positional_embedding.shape != state_dict[
+                                new_k].shape:
+                            print(
+                                f'Resize the pos_embed shape from {state_dict[new_k].shape}'
+                                f' to {self.attnpool.positional_embedding.shape}'
+                            )
+                            cls_pos = state_dict[new_k][0:1, :]
+                            H = W = self.input_resolution // 32
+                            old_h = int(
+                                math.sqrt(state_dict[new_k][1:, ].shape[0]))
+                            spatial_pos = F.interpolate(
+                                state_dict[new_k][1:, ].reshape(
+                                    1, old_h, old_h,
+                                    cls_pos.shape[1]).permute(0, 3, 1, 2),
+                                size=(H, W),
+                                mode='bilinear')
+                            spatial_pos = spatial_pos.reshape(
+                                cls_pos.shape[1], H * W).permute(1, 0)
+                            positional_embedding = torch.cat(
+                                [cls_pos, spatial_pos], dim=0)
+                            state_dict[new_k] = positional_embedding
+                            assert self.attnpool.positional_embedding.shape == state_dict[
+                                new_k].shape
+
+            u, w = self.load_state_dict(state_dict, False)
+            print(u, w, 'are misaligned params in CLIPResNet')
+
+    def _make_layer(self, planes, blocks, stride=1):
+        layers = [Bottleneck(self._inplanes, planes, stride)]
+
+        self._inplanes = planes * Bottleneck.expansion
+        for _ in range(1, blocks):
+            layers.append(Bottleneck(self._inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+
+        def stem(x):
+            for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
+                             (self.conv3, self.bn3)]:
+                x = self.relu(bn(conv(x)))
+            x = self.avgpool(x)
+            return x
+
+        x = x.type(self.conv1.weight.dtype)
+        x = stem(x)
+
+        outs = []
+        x = self.layer1(x)
+        outs.append(x)
+        x = self.layer2(x)
+        outs.append(x)
+        x = self.layer3(x)
+        outs.append(x)
+        x = self.layer4(x)
+        outs.append(x)
+
+        x_global, x_local = self.attnpool(x)
+        outs.append([x_global, x_local])
+
+        return tuple(outs)
+
+
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))  # normalize in fp32
+        return ret.type(orig_type)  # cast back to the input dtype
+
+
+class QuickGELU(nn.Module):
+
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return 'p={}'.format(self.drop_prob)
+
+
+class ResidualAttentionBlock(nn.Module):
+
+    def __init__(self,
+                 d_model: int,
+                 n_head: int,
+                 attn_mask: torch.Tensor = None,
+                 drop_path=0.):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
+                         ('gelu', QuickGELU()),
+                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(
+            dtype=x.dtype,
+            device=x.device) if self.attn_mask is not None else None  # re-stored on self each call
+        return self.attn(
+            x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]  # [0]: attention output
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.drop_path(self.attention(self.ln_1(x)))
+        x = x + self.drop_path(self.mlp(self.ln_2(x)))
+        return x
+
+
+class Transformer(nn.Module):
+
+    def __init__(self,
+                 width: int,
+                 layers: int,
+                 heads: int,
+                 attn_mask: torch.Tensor = None,
+                 drop_path_rate=0.):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, layers)
+               ]  # stochastic depth decay rule
+        self.resblocks = nn.Sequential(*[
+            ResidualAttentionBlock(width, heads, attn_mask, dpr[i])
+            for i in range(layers)
+        ])
+
+    def forward(self, x: torch.Tensor):
+        return self.resblocks(x)
+
+
+class Attention(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
+        self.scale = qk_scale or head_dim**-0.5
+
+        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
+        self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
+        self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)
+
+        self.attn_drop = nn.Dropout(attn_drop)  # NOTE: not applied in forward()
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, q, k, v):
+        B, N, C = q.shape
+        assert k.shape == v.shape
+        B, M, C = k.shape  # re-binds B and C from k; M is the key length
+        q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads)
+        k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads)
+        v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads)
+
+        attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale  # (B, heads, N, M)
+
+        attn = attn.softmax(dim=-1)  # normalize over the M key positions
+
+        x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C)  # merge heads
+
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class TransformerDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        d_model,
+        nhead,
+        dropout=0.1,
+    ):
+        super().__init__()
+        self.self_attn = Attention(d_model, nhead, proj_drop=dropout)
+        self.cross_attn = Attention(d_model, nhead, proj_drop=dropout)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+
+        self.mlp = nn.Sequential(
+            nn.Linear(d_model, d_model * 4), nn.GELU(), nn.Dropout(dropout),
+            nn.Linear(d_model * 4, d_model))
+
+    def forward(self, x, mem):
+        q = k = v = self.norm1(x)  # pre-norm self-attention over x
+        x = x + self.self_attn(q, k, v)
+        q = self.norm2(x)
+        x = x + self.cross_attn(q, mem, mem)  # mem supplies keys and values
+        x = x + self.dropout(self.mlp(self.norm3(x)))  # pre-norm MLP branch
+        return x
+
+
+class CLIPVisionTransformer(nn.Module):
+
+    def __init__(self,
+                 input_resolution=224,
+                 patch_size=32,
+                 width=768,
+                 layers=12,
+                 heads=12,
+                 output_dim=512,
+                 drop_path_rate=0.0,
+                 out_indices=[3, 5, 7, 11],
+                 pretrained=None,
+                 get_embeddings=False,
+                 **kwargs):
+        super().__init__()
+        self.pretrained = pretrained
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(  # patch embedding: one output per patch
+            in_channels=3,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False)
+
+        scale = width**-0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(
+            (input_resolution // patch_size)**2 + 1, width))
+        self.spatial_size = input_resolution // patch_size  # grid side
+        self.ln_pre = LayerNorm(width)
+        self.get_embeddings = get_embeddings
+
+        self.transformer = Transformer(
+            width, layers, heads, drop_path_rate=drop_path_rate)
+
+        self.out_indices = list(out_indices)  # copy: never alias the shared default list
+
+        if get_embeddings:
+            self.ln_post = LayerNorm(width)
+            self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+
+        embed_dim = width
+        # fpn1-fpn4 are created only for patch_size 16 or 8 (not the default 32)
+        if patch_size == 16:
+            self.fpn1 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.ConvTranspose2d(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+                nn.SyncBatchNorm(embed_dim),
+                nn.GELU(),
+                nn.ConvTranspose2d(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+            )
+
+            self.fpn2 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.ConvTranspose2d(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+            )
+
+            self.fpn3 = nn.GroupNorm(1, embed_dim)
+
+            self.fpn4 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.MaxPool2d(kernel_size=2, stride=2))
+
+        elif patch_size == 8:
+            self.fpn1 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.ConvTranspose2d(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+            )
+
+            self.fpn2 = nn.GroupNorm(1, embed_dim)
+
+            self.fpn3 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.MaxPool2d(kernel_size=2, stride=2),
+            )
+
+            self.fpn4 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.MaxPool2d(kernel_size=4, stride=4),
+            )
+
+    def init_weights(self, pretrained=None):  # load 'visual.*' weights from a TorchScript file
+        pretrained = pretrained or self.pretrained
+        if isinstance(pretrained, str):
+            checkpoint = torch.jit.load(
+                pretrained, map_location='cpu').float().state_dict()
+
+            state_dict = {}
+
+            for k in checkpoint.keys():
+                if k.startswith('visual.'):
+                    new_k = k.replace('visual.', '')
+                    state_dict[new_k] = checkpoint[k]
+
+            if 'positional_embedding' in state_dict.keys():
+                if self.positional_embedding.shape != state_dict[
+                        'positional_embedding'].shape:
+                    print(
+                        f'Resize the pos_embed shape from {state_dict["positional_embedding"].shape} to'
+                        f' {self.positional_embedding.shape}')
+                    cls_pos = state_dict['positional_embedding'][0:1, :]
+                    src = state_dict['positional_embedding'][1:, ]
+                    n = int(math.sqrt(src.shape[0]))  # checkpoint grid side
+                    spatial_pos = F.interpolate(
+                        src.reshape(1, n, n, -1).permute(0, 3, 1, 2),
+                        size=(self.spatial_size, self.spatial_size),
+                        mode='bilinear')
+                    spatial_pos = spatial_pos.reshape(
+                        cls_pos.shape[1], self.spatial_size**2).permute(1, 0)
+                    positional_embedding = torch.cat([cls_pos, spatial_pos],
+                                                     dim=0)
+                    state_dict['positional_embedding'] = positional_embedding
+                    assert self.positional_embedding.shape == state_dict[
+                        'positional_embedding'].shape
+
+            u, w = self.load_state_dict(state_dict, False)  # strict=False
+            print(u, w, 'are misaligned params in vision transformer')
+
+    def forward(self, x: torch.Tensor):
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        B, C, H, W = x.shape
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        x1 = self.class_embedding.to(x.dtype)
+        x2 = torch.zeros(
+            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        x = torch.cat([x1 + x2, x], dim=1)  # prepend the class token
+        pos = self.positional_embedding.to(x.dtype)
+        cls_pos = pos[0, :] + self.class_embedding.to(x.dtype)  # NOTE(review): adds class_embedding a 2nd time - confirm
+        spatial_pos = F.interpolate(
+            pos[1:, ].reshape(1, self.spatial_size, self.spatial_size,
+                              C).permute(0, 3, 1, 2),
+            size=(H, W),
+            mode='bilinear')
+        spatial_pos = spatial_pos.reshape(1, C, H * W).permute(0, 2, 1)
+        pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1)
+        x = x + pos
+        x = self.ln_pre(x)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+
+        gradientcheckpoint = False  # hard-coded: checkpoint branch below never runs
+
+        features = []
+        for i, blk in enumerate(self.transformer.resblocks):
+            if gradientcheckpoint:
+                x = checkpoint.checkpoint(blk, x)
+            else:
+                x = blk(x)
+
+            if i in self.out_indices:  # drop class token, restore (B, C, H, W)
+                xp = x.permute(1, 0, 2)[:,
+                                        1:, :].permute(0, 2,
+                                                       1).reshape(B, -1, H, W)
+                features.append(xp.contiguous())
+
+        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]  # NOTE(review): only defined for patch_size 16 or 8
+        for i in range(len(features)):
+            features[i] = ops[i](features[i])
+
+        if self.get_embeddings:
+            x = x.permute(1, 0, 2)  # LND -> NLD
+            x = self.ln_post(x)
+            x = x @ self.proj
+
+            global_embedding = x[:, 0]  # class-token embedding
+            visual_embedding = x[:, 1:].reshape(B, H, W,
+                                                -1).permute(0, 3, 1,
+                                                            2)  # B C H W
+
+            features.append([global_embedding, visual_embedding])
+
+        return tuple(features)
+
+
+class CLIPTextEncoder(nn.Module):
+
+    def __init__(self,
+                 context_length=77,
+                 vocab_size=49408,
+                 transformer_width=512,
+                 transformer_heads=8,
+                 transformer_layers=12,
+                 embed_dim=1024,
+                 out_dim=256,
+                 pretrained=None,
+                 **kwargs):
+        super().__init__()
+
+        self.pretrained = pretrained
+
+        self.context_length = context_length
+
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask())
+
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(
+            torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(
+            torch.empty(transformer_width, embed_dim))
+
+    def init_weights(self, pretrained=None):
+        pretrained = pretrained or self.pretrained
+        if isinstance(pretrained, str):
+            checkpoint = torch.jit.load(
+                pretrained, map_location='cpu').float().state_dict()
+
+            state_dict = {}
+
+            for k in checkpoint.keys():
+                if k.startswith('transformer.'):
+                    state_dict[k] = checkpoint[k]
+
+                if k == 'positional_embedding' or k == 'text_projection' or k.startswith(
+                        'token_embedding') or k.startswith('ln_final'):
+                    if k == 'positional_embedding' and checkpoint[k].size(
+                            0) > self.context_length:
+                        checkpoint[k] = checkpoint[k][:self.context_length]
+                        print('positional_embedding is tuncated from 77 to',
+                              self.context_length)
+                    state_dict[k] = checkpoint[k]
+
+            u, w = self.load_state_dict(state_dict, False)
+            print(u, w, 'are misaligned params in text encoder')
+
+    def build_attention_mask(self):
+        # build the causal attention mask (called once from __init__)
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float('-inf'))
+        mask.triu_(1)  # keep -inf strictly above the diagonal; zero the rest
+        return mask
+
+    def forward(self, text):
+        x = self.token_embedding(text)
+        x = x + self.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x)
+        x = x[torch.arange(x.shape[0]),
+              text.argmax(dim=-1), ...] @ self.text_projection  # per row: position of the largest token id
+        return x
+
+
+class CLIPTextContextEncoder(nn.Module):
+
+    def __init__(self,
+                 context_length=22,
+                 vocab_size=49408,
+                 transformer_width=512,
+                 transformer_heads=8,
+                 transformer_layers=12,
+                 embed_dim=1024,
+                 out_dim=256,
+                 pretrained=None,
+                 **kwargs):
+        super().__init__()
+
+        self.pretrained = pretrained
+
+        self.context_length = context_length
+
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask())
+
+        self.embed_dim = embed_dim
+
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(
+            torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(
+            torch.empty(transformer_width, embed_dim))
+
+    def init_weights(self, pretrained=None):
+        pretrained = pretrained or self.pretrained
+        if isinstance(pretrained, str):
+            checkpoint = torch.jit.load(
+                pretrained, map_location='cpu').float().state_dict()
+
+            state_dict = {}
+
+            for k in checkpoint.keys():
+                if k.startswith('transformer.'):
+                    state_dict[k] = checkpoint[k]
+
+                if k == 'positional_embedding' or k == 'text_projection' or k.startswith(
+                        'token_embedding') or k.startswith('ln_final'):
+                    if k == 'positional_embedding' and checkpoint[k].size(
+                            0) > self.context_length:
+                        checkpoint[k] = checkpoint[k][:self.context_length]
+                        print('positional_embedding is tuncated from 77 to',
+                              self.context_length)
+                    state_dict[k] = checkpoint[k]
+
+            u, w = self.load_state_dict(state_dict, False)
+            print(u, w, 'are misaligned params in text encoder')
+
+    def build_attention_mask(self):
+        # build the causal attention mask (called once from __init__)
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float('-inf'))
+        mask.triu_(1)  # keep -inf strictly above the diagonal; zero the rest
+        return mask
+
+    def forward(self, text, context=None):
+        x_text = self.token_embedding(text)  # n_class, n_text, C
+        K, N1, C = x_text.shape  # e.g. 150 classes * 5(?) tokens * 512
+        B, N2, C = context.shape  # e.g. 1 * 8 * 512; NOTE(review): fails when context is None
+
+        eos_indx = text.argmax(dim=-1) + N2  # argmax position shifted by the N2 context tokens
+        eos_indx = eos_indx.reshape(1, K).expand(B, K).reshape(-1)
+
+        x_text = x_text.reshape(1, K, N1, C).expand(B, K, N1, C)
+        context = context.reshape(B, 1, N2, C).expand(B, K, N2, C)
+
+        x = torch.cat([x_text[:, :, 0:1], context, x_text[:, :, 1:]],
+                      dim=2).reshape(B * K, N1 + N2, C)  # [first text token, context, rest of text]
+        x = x + self.positional_embedding  # expects N1 + N2 to match context_length
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x)
+        x = x[torch.arange(x.shape[0]), eos_indx] @ self.text_projection
+        x = x.reshape(B, K, self.embed_dim)
+        return x
+
+
+class ContextDecoder(nn.Module):
+
+    def __init__(self,
+                 transformer_width=256,
+                 transformer_heads=4,
+                 transformer_layers=6,
+                 visual_dim=1024,
+                 dropout=0.1,
+                 **kwargs):
+        super().__init__()
+
+        self.memory_proj = nn.Sequential(
+            nn.LayerNorm(visual_dim),
+            nn.Linear(visual_dim, transformer_width),
+            nn.LayerNorm(transformer_width),
+        )
+
+        self.text_proj = nn.Sequential(
+            nn.LayerNorm(visual_dim),
+            nn.Linear(visual_dim, transformer_width),
+        )
+
+        self.decoder = nn.ModuleList([
+            TransformerDecoderLayer(transformer_width, transformer_heads,
+                                    dropout) for _ in range(transformer_layers)
+        ])
+
+        self.out_proj = nn.Sequential(
+            nn.LayerNorm(transformer_width),
+            nn.Linear(transformer_width, visual_dim))
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):  # per-module initializer for self.apply
+        if isinstance(m, nn.Linear):  # truncated-normal weight, zero bias
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):  # bias 0, weight 1
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def forward(self, text, visual):
+        B, N, C = visual.shape  # values unused; unpacking requires a 3-D visual
+        visual = self.memory_proj(visual)
+        x = self.text_proj(text)
+
+        for layer in self.decoder:
+            x = layer(x, visual)  # x attends to itself, then to the visual memory
+
+        return self.out_proj(x)
diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py
new file mode 100644
index 00000000..108cb043
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py
@@ -0,0 +1,217 @@
+""" FPNneck
+Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+https://github.com/open-mmlab/mmsegmentation/,
+originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+and adapted from https://github.com/raoyongming/DenseCLIP/,
+originally MIT License, Copyright (c) 2022 Rao, Yongming.
+"""
+
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from timm.models.layers import drop, drop_path, trunc_normal_
+
+from .common import resize
+
+
+class FPN(nn.Module):
+    """Feature Pyramid Network.
+
+    This neck is the implementation of `Feature Pyramid Networks for Object
+    Detection <https://arxiv.org/abs/1612.03144>`_.
+
+    Args:
+        in_channels (list[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, its actual mode is specified by `extra_convs_on_inputs`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed
+
+            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+            - 'on_lateral': Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs
+            on the original feature from the backbone. If True,
+            it is equivalent to `add_extra_convs='on_input'`. If False, it is
+            equivalent to set `add_extra_convs='on_output'`. Default to False.
+        relu_before_extra_convs (bool): Whether to apply relu before the extra
+            conv. Default: False.
+        no_norm_on_lateral (bool): Whether to apply norm on lateral.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        act_cfg (dict): Config dict for activation layer in ConvModule.
+            Default: None.
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: dict(mode='nearest').
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 extra_convs_on_inputs=False,
+                 relu_before_extra_convs=False,
+                 no_norm_on_lateral=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 upsample_cfg=dict(mode='nearest')):
+        super(FPN, self).__init__()
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.relu_before_extra_convs = relu_before_extra_convs
+        self.no_norm_on_lateral = no_norm_on_lateral
+        self.fp16_enabled = False
+        self.upsample_cfg = upsample_cfg.copy()
+
+        if end_level == -1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level < inputs, no extra level is allowed
+            self.backbone_end_level = end_level
+            assert end_level <= len(in_channels)
+            assert num_outs == end_level - start_level
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+        assert isinstance(add_extra_convs, (str, bool))
+        if isinstance(add_extra_convs, str):
+            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
+            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
+        elif add_extra_convs:  # True
+            if extra_convs_on_inputs:
+                # For compatibility with previous release
+                # TODO: deprecate `extra_convs_on_inputs`
+                self.add_extra_convs = 'on_input'
+            else:
+                self.add_extra_convs = 'on_output'
+
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_convs = nn.ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
+                act_cfg=act_cfg,
+                inplace=False)
+            fpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False)
+
+            self.lateral_convs.append(l_conv)
+            self.fpn_convs.append(fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        if self.add_extra_convs and extra_levels >= 1:
+            for i in range(extra_levels):
+                if i == 0 and self.add_extra_convs == 'on_input':
+                    in_channels = self.in_channels[self.backbone_end_level - 1]
+                else:
+                    in_channels = out_channels
+                extra_fpn_conv = ConvModule(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+                self.fpn_convs.append(extra_fpn_conv)
+
+        self.apply(self._init_weights)
+
+    def forward(self, inputs):
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
+            #  it cannot co-exist with `size` in `F.interpolate`.
+            if 'scale_factor' in self.upsample_cfg:
+                laterals[i - 1] = laterals[i - 1] + resize(
+                    laterals[i], **self.upsample_cfg)
+            else:
+                prev_shape = laterals[i - 1].shape[2:]
+                laterals[i - 1] = laterals[i - 1] + resize(
+                    laterals[i], size=prev_shape, **self.upsample_cfg)
+
+        # build outputs
+        # part 1: from original levels
+        outs = [
+            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+        ]
+        # part 2: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    extra_source = inputs[self.backbone_end_level - 1]
+                elif self.add_extra_convs == 'on_lateral':
+                    extra_source = laterals[-1]
+                elif self.add_extra_convs == 'on_output':
+                    extra_source = outs[-1]
+                else:
+                    raise NotImplementedError
+                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
+
+    def _init_weights(self, m):
+        """Initialise one sub-module (applied through `self.apply`)."""
+        if isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+            return
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+        elif isinstance(m, nn.Conv2d):
+            nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
+        if isinstance(m, (nn.Linear, nn.Conv2d)) and m.bias is not None:
+            nn.init.constant_(m.bias, 0)
diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_base.py b/modelscope/models/cv/shop_segmentation/shop_seg_base.py
new file mode 100644
index 00000000..e3ae0d54
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/shop_seg_base.py
@@ -0,0 +1,157 @@
+"""
+Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+https://github.com/open-mmlab/mmsegmentation/,
+originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+and adapted from https://github.com/raoyongming/DenseCLIP/,
+originally MIT License, Copyright (c) 2022 Rao, Yongming.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .head_fpn import FPNHead
+from .models import (CLIPTextContextEncoder, CLIPVisionTransformer,
+                     ContextDecoder)
+from .neck_fpn import FPN
+from .utils import SimpleTokenizer, tokenize
+
+
+class SHOPSEG(nn.Module):
+    """Encoder Decoder segmentors.
+
+    EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+    Note that auxiliary_head is only used for deep supervision during training,
+    which could be dumped during inference.
+    """
+
+    def __init__(self,
+                 model_dir,
+                 context_length=22,
+                 context_feature='attention',
+                 score_concat_index=2,
+                 tau=0.07,
+                 token_embed_dim=512,
+                 text_dim=512,
+                 **args):
+        """Build the CLIP backbone, text encoder, context decoder and FPN.
+
+        `model_dir` must contain `bpe_simple_vocab_16e6.txt.gz`.
+        """
+        super(SHOPSEG, self).__init__()
+        self.model_dir = model_dir
+        self.tokenizer = SimpleTokenizer(model_dir
+                                         + '/bpe_simple_vocab_16e6.txt.gz')
+
+        backbone = CLIPVisionTransformer(
+            input_resolution=1024,
+            patch_size=16,
+            width=768,
+            layers=12,
+            output_dim=512,
+            drop_path_rate=0.1,
+            pretrained=False,
+            get_embeddings=True)
+
+        text_encoder = CLIPTextContextEncoder(
+            context_length=30,
+            vocab_size=49408,
+            transformer_width=512,
+            transformer_heads=8,
+            transformer_layers=12,
+            embed_dim=512,
+            pretrained=False)
+
+        context_decoder = ContextDecoder(
+            transformer_width=256,
+            transformer_heads=4,
+            transformer_layers=3,
+            visual_dim=512,
+            dropout=0.1)
+        neck = FPN(
+            in_channels=[768, 768, 768 + 2, 768], out_channels=256, num_outs=4)
+        head_fpd = FPNHead(channels=256, num_classes=2)
+
+        self.backbone = backbone
+        self.text_encoder = text_encoder
+        self.context_decoder = context_decoder
+        self.context_length = context_length
+        self.score_concat_index = score_concat_index
+        self.context_feature = context_feature
+        # Honour the `tau` argument (default 0.07), not a hard-coded value.
+        self.tau = tau
+        num_learnable = self.text_encoder.context_length - self.context_length
+        self.contexts = nn.Parameter(
+            torch.randn(1, num_learnable, token_embed_dim))
+        nn.init.trunc_normal_(self.contexts)
+        self.gamma = nn.Parameter(torch.ones(text_dim) * 1e-4)
+        self.neck = neck
+        self.head_fpn = head_fpd
+
+    def encode_text(self, text, context_length):
+        output = tokenize(self.tokenizer, text, context_length, True)
+        return output
+
+    def extract_feat(self, img):
+        """Extract features from images."""
+        x = self.backbone(img)
+        return x
+
+    def after_extract_feat(self, x, name_list):
+        """Fuse text/image similarity maps into the backbone features.
+
+        `x[0:4]` are the feature maps and `x[4]` is a
+        `(global_feat, visual_embeddings)` pair; `name_list` holds two
+        prompts per image. Returns `(x_orig, score_map)`.
+        """
+        x_orig = list(x[0:4])
+        global_feat, visual_embeddings = x[4]
+        B, C, H, W = visual_embeddings.shape
+        if self.context_feature != 'attention':
+            # Only the 'attention' context is implemented; fail clearly.
+            raise NotImplementedError(
+                f'unsupported context_feature: {self.context_feature!r}')
+        # Global feature followed by the H * W positions: (B, 1 + H * W, C).
+        visual_context = torch.cat([
+            global_feat.reshape(B, C, 1),
+            visual_embeddings.reshape(B, C, H * W)
+        ], dim=2).permute(0, 2, 1)
+        # Tokenise every name and stack the results along dim 0.
+        texts = torch.cat([
+            self.encode_text(c, context_length=self.context_length)
+            for c in name_list
+        ])
+        texts = self.text_encoder(texts.to(global_feat.device), self.contexts)
+        text_embeddings = texts.expand(B, -1, -1)
+        # Update the text embeddings with the visual context.
+        text_diff = self.context_decoder(text_embeddings, visual_context)
+        text_embeddings = text_embeddings + self.gamma * text_diff
+
+        # Score maps: image i is compared with text columns 2i and 2i + 1.
+        visual_embeddings = F.normalize(visual_embeddings, dim=1, p=2)
+        text = F.normalize(text_embeddings, dim=2, p=2)
+        score_map = torch.cat([
+            torch.einsum('bchw,bkc->bkhw', visual_embeddings[i:i + 1],
+                         text[i:i + 1, 2 * i:2 * i + 2])
+            for i in range(B)
+        ], dim=0)  # (B, 2, H, W)
+        x_orig[self.score_concat_index] = torch.cat(
+            [x_orig[self.score_concat_index], score_map], dim=1)
+        return x_orig, score_map
+
+    def forward(self, img, text_list=None):
+        """Return the `head_fpn` prediction for `img`.
+
+        `text_list` holds one prompt string per image in the batch.
+        """
+        if text_list is None:
+            # NOTE(review): 'foregeound' may be a typo of 'foreground'; confirm.
+            text_list = ['foregeound'] * img.size()[0]
+        x = self.extract_feat(img)
+        # Per image: an 'others' prompt, then its own prompt cut to 20 chars.
+        name_list = []
+        for name in text_list:
+            name_list.extend(['others', name[0:20]])
+        x_orig, _ = self.after_extract_feat(x, name_list)
+        return self.head_fpn(list(self.neck(x_orig)))
diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_model.py b/modelscope/models/cv/shop_segmentation/shop_seg_model.py
new file mode 100644
index 00000000..409c583b
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/shop_seg_model.py
@@ -0,0 +1,115 @@
+import os.path as osp
+from typing import Any, Dict
+
+import json
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.shop_segmentation import SHOPSEG
+from modelscope.outputs import OutputKeys
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['ShopSegmentation']
+
+
+@MODELS.register_module(
+    Tasks.shop_segmentation, module_name=Models.shop_segmentation)
+class ShopSegmentation(TorchModel):
+    """ shop segmentation model.
+    """
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        """Load weights from `model_dir`; `device_id < 0` selects the CPU."""
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+        self.model = SHOPSEG(model_dir=model_dir)
+        pretrained_params = torch.load(
+            '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location='cpu')  # keeps GPU-saved weights loadable on CPU
+        self.model.load_state_dict(pretrained_params)
+        self.model.eval()
+        self.device_id = device_id
+        if self.device_id >= 0 and torch.cuda.is_available():
+            self.model.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        else:
+            self.device_id = -1
+            logger.info('Use CPU for inference')
+
+    def preprocess(self, img, size=1024):
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+        h, w, c = img.shape
+        max_hw = max(h, w)
+        ratio = 1.0 * size / max_hw
+        crop_h, crop_w = int(ratio * h), int(ratio * w)
+        pil_img = Image.fromarray(img)
+        pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR)
+        np_img = np.array(pil_img, dtype=np.float32) / 255.
+
+        for j in range(3):
+            np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j]
+
+        img_pad = np.zeros((size, size, 3), dtype=np.float32)
+        img_pad[:crop_h, :crop_w] = np_img
+
+        img_pad = torch.from_numpy(img_pad).permute(2, 0,
+                                                    1).unsqueeze(0).float()
+        return img_pad, h, w, crop_h, crop_w
+
+    def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
+        output = np.clip(tensors * 255., a_min=0, a_max=255.)
+        crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)
+
+        pil_output = Image.fromarray(crop_output)
+        pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
+        np_output = np.array(pil_output, dtype=np.uint8)
+
+        np_output[np_output < 128] = 0
+        np_output[np_output >= 128] = 255
+        np_output = np.uint8(np_output)
+        return np_output
+
+    def forward(self, image):
+        """Segment one image end to end and return `{OutputKeys.MASKS: mask}`.
+
+        image should be numpy array, dtype=np.uint8, shape: height*width*3
+        """
+        image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
+            image, size=1024)
+        pred = self.inference(image_tensor)
+        # `postprocess` takes no `size` argument; passing one is a TypeError.
+        msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)
+        return {OutputKeys.MASKS: msk}
+
+    def inference(self, image):
+        """
+        image should be tensor, 1 * 3 * 1024 * 1024
+        """
+        with torch.no_grad():
+            if self.device_id == -1:
+                output = self.model(image)
+            else:
+                device = torch.device('cuda', self.device_id)
+                output = self.model(image.to(device))
+            output = F.interpolate(output, size=(1024, 1024), mode='bilinear')
+            output = F.softmax(output, dim=1)
+            output = torch.argmax(output, dim=1)
+            output = output[0]
+            if self.device_id == -1:
+                pred = output.data.numpy()
+            else:
+                pred = output.data.cpu().numpy()
+
+            del output
+        return pred
diff --git a/modelscope/models/cv/shop_segmentation/utils.py b/modelscope/models/cv/shop_segmentation/utils.py
new file mode 100644
index 00000000..c41f8a65
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/utils.py
@@ -0,0 +1,199 @@
+""" CLIP Tokenizer
+Adapted from https://github.com/openai/CLIP.
+Originally MIT License, Copyright (c) 2021 OpenAI.
+"""
+
+import gzip
+import html
+import os
+from functools import lru_cache
+from typing import Any, List, Union
+
+import ftfy
+import regex as re
+import torch
+
+
+@lru_cache()
+def default_bpe():
+    return os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'bpe_simple_vocab_16e6.txt.gz')
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord('!'),
+                    ord('~') + 1)) + list(range(
+                        ord('¡'),
+                        ord('¬') + 1)) + list(range(ord('®'),
+                                                    ord('ÿ') + 1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text
+
+
+class SimpleTokenizer(object):
+
+    def __init__(self, bpe_path: str = default_bpe()):
+        """Build the BPE vocabulary from the gzipped merges file `bpe_path`."""
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with gzip.open(bpe_path) as bpe_file:  # closes the gzip handle
+            merges = bpe_file.read().decode('utf-8').split('\n')
+        merges = [tuple(m.split()) for m in merges[1:49152 - 256 - 2 + 1]]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v + '</w>' for v in vocab]
+        vocab.extend(''.join(merge) for merge in merges)
+        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
+        self.encoder = dict(zip(vocab, range(len(vocab))))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {
+            '<|startoftext|>': '<|startoftext|>',
+            '<|endoftext|>': '<|endoftext|>'
+        }
+        self.pat = re.compile(
+            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+            re.IGNORECASE)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + '</w>', )
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + '</w>'
+
+        error_list = []
+        while True:
+            bigram = min(
+                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except Exception as err:
+                    error_list.append(err)
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[
+                        i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b]
+                            for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token]
+                              for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode(
+            'utf-8', errors='replace').replace('</w>', ' ')
+        return text
+
+
+def tokenize(tokenizer,
+             texts,
+             context_length: int = 77,
+             truncate: bool = False) -> torch.LongTensor:
+    """
+    Returns the tokenized representation of given input string(s)
+    Parameters
+    ----------
+    texts : Union[str, List[str]]
+        An input string or a list of input strings to tokenize
+    context_length : int
+        The context length to use; all CLIP models use 77 as the context length
+    truncate: bool
+        Whether to truncate the text in case its encoding is longer than the context length
+    Returns
+    -------
+    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
+    """
+    if isinstance(texts, str):
+        texts = [texts]
+
+    sot_token = tokenizer.encoder['<|startoftext|>']
+    eot_token = tokenizer.encoder['<|endoftext|>']
+    all_tokens = [[sot_token] + tokenizer.encode(text) + [eot_token]
+                  for text in texts]
+    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
+
+    for i, tokens in enumerate(all_tokens):
+        if len(tokens) > context_length:
+            if truncate:
+                tokens = tokens[:context_length]
+                tokens[-1] = eot_token
+            else:
+                raise RuntimeError(
+                    f'Input {texts[i]} is too long for context length {context_length}'
+                )
+        result[i, :len(tokens)] = torch.tensor(tokens)
+
+    return result
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index e84c8dcc..8fe71ec2 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -259,7 +259,13 @@ TASK_OUTPUTS = {
     #       ]
     #   }
     Tasks.text_driven_segmentation: [OutputKeys.MASKS],
-
+    # shop segmentation result for single sample
+    #   {
+    #       "masks": [
+    #           np.array # 2D array containing only 0, 255
+    #       ]
+    #   }
+    Tasks.shop_segmentation: [OutputKeys.MASKS],
     # movide scene segmentation result for a single video
     # {
     #        "split_video_num":3,
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index f43d152b..f6381857 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -156,7 +156,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/cv_vitl16_segmentation_text-driven-seg'),
     Tasks.movie_scene_segmentation:
     (Pipelines.movie_scene_segmentation,
-     'damo/cv_resnet50-bert_video-scene-segmentation_movienet')
+     'damo/cv_resnet50-bert_video-scene-segmentation_movienet'),
+    Tasks.shop_segmentation: (Pipelines.shop_segmentation,
+                              'damo/cv_vitb16_segmentation_shop-seg'),
 }
 
 
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 9e7d80ee..d3dba978 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -43,10 +43,10 @@ if TYPE_CHECKING:
     from .tinynas_classification_pipeline import TinynasClassificationPipeline
     from .video_category_pipeline import VideoCategoryPipeline
     from .virtual_try_on_pipeline import VirtualTryonPipeline
+    from .shop_segmentation_pipleline import ShopSegmentationPipeline
     from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline
     from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline
     from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
-
 else:
     _import_structure = {
         'action_recognition_pipeline': ['ActionRecognitionPipeline'],
@@ -96,6 +96,7 @@ else:
         'tinynas_classification_pipeline': ['TinynasClassificationPipeline'],
         'video_category_pipeline': ['VideoCategoryPipeline'],
         'virtual_try_on_pipeline': ['VirtualTryonPipeline'],
+        'shop_segmentation_pipleline': ['ShopSegmentationPipeline'],
         'easycv_pipeline': [
             'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline',
             'Face2DKeypointsPipeline'
diff --git a/modelscope/pipelines/cv/shop_segmentation_pipleline.py b/modelscope/pipelines/cv/shop_segmentation_pipleline.py
new file mode 100644
index 00000000..b7fd90b4
--- /dev/null
+++ b/modelscope/pipelines/cv/shop_segmentation_pipleline.py
@@ -0,0 +1,51 @@
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+
+
+@PIPELINES.register_module(
+    Tasks.shop_segmentation, module_name=Pipelines.shop_segmentation)
+class ShopSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, auto_collate=False, **kwargs)
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img)
+        result = {
+            'img': img_tensor,
+            'ori_h': ori_h,
+            'ori_w': ori_w,
+            'crop_h': crop_h,
+            'crop_w': crop_w
+        }
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+
+        outputs = self.model.inference(input['img'])
+        result = {
+            'data': outputs,
+            'ori_h': input['ori_h'],
+            'ori_w': input['ori_w'],
+            'crop_h': input['crop_h'],
+            'crop_w': input['crop_w'],
+        }
+        return result
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+
+        data = self.model.postprocess(inputs['data'], inputs['crop_h'],
+                                      inputs['crop_w'], inputs['ori_h'],
+                                      inputs['ori_w'])
+        outputs = {OutputKeys.MASKS: data}
+        return outputs
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 86808ea1..1b738bfe 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -38,6 +38,7 @@ class CVTasks(object):
     image_segmentation = 'image-segmentation'
     portrait_matting = 'portrait-matting'
     text_driven_segmentation = 'text-driven-segmentation'
+    shop_segmentation = 'shop-segmentation'
 
     # image editing
     skin_retouching = 'skin-retouching'
diff --git a/tests/pipelines/test_shop_segmentation.py b/tests/pipelines/test_shop_segmentation.py
new file mode 100644
index 00000000..58c56dd7
--- /dev/null
+++ b/tests/pipelines/test_shop_segmentation.py
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class ShopSegmentationTest(unittest.TestCase):
+    """Run the pipeline on a sample image and write the mask to disk."""
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_shop_segmentation(self):
+        image_path = 'data/test/images/shop_segmentation.jpg'
+        model_id = 'damo/cv_vitb16_segmentation_shop-seg'
+        segmenter = pipeline(Tasks.shop_segmentation, model=model_id)
+        output = segmenter(image_path)
+        import cv2  # only the MASKS entry (the segment map) is needed below
+        cv2.imwrite(image_path + '_shopseg.jpg', output[OutputKeys.MASKS])
+
+
+if __name__ == '__main__':
+    unittest.main()

From f508be89183cc2d9047bbb6fcbe23685a239959d Mon Sep 17 00:00:00 2001
From: ly261666 <ly261666@alibaba-inc.com>
Date: Sat, 3 Sep 2022 23:48:42 +0800
Subject: [PATCH 051/175] =?UTF-8?q?[to=20#42322933]=20=E6=96=B0=E5=A2=9ERe?=
 =?UTF-8?q?tinaFace=E4=BA=BA=E8=84=B8=E6=A3=80=E6=B5=8B=E5=99=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 新增人脸检测RetinaFace模型；
2. 完成Maas-cv CR标准自查
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9945188
---
 data/test/images/retina_face_detection.jpg    |   3 +
 modelscope/metainfo.py                        |   2 +
 .../cv/face_detection/retinaface/__init__.py  |   0
 .../cv/face_detection/retinaface/detection.py | 137 ++++++++++++++++
 .../retinaface/models/__init__.py             |   0
 .../face_detection/retinaface/models/net.py   | 149 ++++++++++++++++++
 .../retinaface/models/retinaface.py           | 145 +++++++++++++++++
 .../cv/face_detection/retinaface/utils.py     | 123 +++++++++++++++
 modelscope/pipelines/base.py                  |   1 -
 .../cv/retina_face_detection_pipeline.py      |  55 +++++++
 tests/pipelines/test_retina_face_detection.py |  33 ++++
 11 files changed, 647 insertions(+), 1 deletion(-)
 create mode 100644 data/test/images/retina_face_detection.jpg
 create mode 100644 modelscope/models/cv/face_detection/retinaface/__init__.py
 create mode 100755 modelscope/models/cv/face_detection/retinaface/detection.py
 create mode 100755 modelscope/models/cv/face_detection/retinaface/models/__init__.py
 create mode 100755 modelscope/models/cv/face_detection/retinaface/models/net.py
 create mode 100755 modelscope/models/cv/face_detection/retinaface/models/retinaface.py
 create mode 100755 modelscope/models/cv/face_detection/retinaface/utils.py
 create mode 100644 modelscope/pipelines/cv/retina_face_detection_pipeline.py
 create mode 100644 tests/pipelines/test_retina_face_detection.py

diff --git a/data/test/images/retina_face_detection.jpg b/data/test/images/retina_face_detection.jpg
new file mode 100644
index 00000000..c95881fe
--- /dev/null
+++ b/data/test/images/retina_face_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
+size 87228
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index b1bf9600..9638268c 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -32,6 +32,7 @@ class Models(object):
     vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
     text_driven_segmentation = 'text-driven-segmentation'
     resnet50_bert = 'resnet50-bert'
+    retinaface = 'retinaface'
     shop_segmentation = 'shop-segmentation'
 
     # EasyCV models
@@ -118,6 +119,7 @@ class Pipelines(object):
     salient_detection = 'u2net-salient-detection'
     image_classification = 'image-classification'
     face_detection = 'resnet-face-detection-scrfd10gkps'
+    retina_face_detection = 'resnet50-face-detection-retinaface'
     live_category = 'live-category'
     general_image_classification = 'vit-base_image-classification_ImageNet-labels'
     daily_image_classification = 'vit-base_image-classification_Dailylife-labels'
diff --git a/modelscope/models/cv/face_detection/retinaface/__init__.py b/modelscope/models/cv/face_detection/retinaface/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/retinaface/detection.py b/modelscope/models/cv/face_detection/retinaface/detection.py
new file mode 100755
index 00000000..3dd31659
--- /dev/null
+++ b/modelscope/models/cv/face_detection/retinaface/detection.py
@@ -0,0 +1,137 @@
+# The implementation is based on Pytorch_Retinaface, available at https://github.com/biubug6/Pytorch_Retinaface
+import cv2
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .models.retinaface import RetinaFace
+from .utils import PriorBox, decode, decode_landm, py_cpu_nms
+
+
+@MODELS.register_module(Tasks.face_detection, module_name=Models.retinaface)
+class RetinaFaceDetection(TorchModel):
+    """RetinaFace face detector producing boxes, scores and 5 landmarks.
+
+    Adapted from https://github.com/biubug6/Pytorch_Retinaface.
+    """
+
+    def __init__(self, model_path, device='cuda'):
+        """Build the network, load its checkpoint and move it to `device`.
+
+        Args:
+            model_path: checkpoint path; the configuration path is obtained
+                by swapping the checkpoint file name for the config file name.
+            device: device the network and its inputs are moved to.
+        """
+        super().__init__(model_path)
+        # NOTE: the two switches below are process-wide, not per-instance.
+        torch.set_grad_enabled(False)
+        cudnn.benchmark = True
+        self.model_path = model_path
+        self.cfg = Config.from_file(
+            model_path.replace(ModelFile.TORCH_MODEL_FILE,
+                               ModelFile.CONFIGURATION))['models']
+        self.net = RetinaFace(cfg=self.cfg)
+        self.load_model()
+        self.device = device
+        self.net = self.net.to(self.device)
+        self.mean = torch.tensor([[[[104]], [[117]], [[123]]]]).to(device)
+
+    def check_keys(self, pretrained_state_dict):
+        """Assert that the checkpoint shares at least one key with the net."""
+        ckpt_keys = set(pretrained_state_dict.keys())
+        model_keys = set(self.net.state_dict().keys())
+        used_pretrained_keys = model_keys & ckpt_keys
+        assert len(
+            used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint'
+        return True
+
+    def remove_prefix(self, state_dict, prefix):
+        """Return a new dict with `prefix` stripped from keys that have it."""
+        return {
+            (k[len(prefix):] if k.startswith(prefix) else k): v
+            for k, v in state_dict.items()
+        }
+
+    def load_model(self, load_to_cpu=False):
+        """Load `self.model_path` into `self.net` and switch it to eval mode.
+
+        `load_to_cpu` is kept for compatibility and is not used; the
+        checkpoint is always deserialized onto the CPU.
+        """
+        state = torch.load(self.model_path, map_location=torch.device('cpu'))
+        if 'state_dict' in state.keys():
+            state = state['state_dict']
+        state = self.remove_prefix(state, 'module.')
+        self.check_keys(state)
+        self.net.load_state_dict(state, strict=False)
+        self.net.eval()
+
+    def forward(self, input):
+        """Detect faces in a single image.
+
+        Args:
+            input: dict whose 'img' entry is an HxWx3 image tensor.
+
+        Returns:
+            Tuple `(dets, landms)` of numpy arrays in input-image pixels:
+            `dets` is (N, 5) as x1, y1, x2, y2, score; `landms` is (N, 10).
+        """
+        img = np.float32(input['img'].cpu().numpy())
+        im_height, im_width = img.shape[:2]
+        ss = 1.0
+        # Shrink very large inputs so that the longer side becomes 1000.
+        if max(im_height, im_width) > 1500:
+            ss = 1000.0 / max(im_height, im_width)
+            img = cv2.resize(img, (0, 0), fx=ss, fy=ss)
+            im_height, im_width = img.shape[:2]
+
+        img -= (104, 117, 123)
+        img = torch.from_numpy(img.transpose(2, 0, 1)).unsqueeze(0)
+        loc, conf, landms = self.net(img.to(self.device))  # forward pass
+        del img
+
+        confidence_threshold = 0.9
+        nms_threshold = 0.4
+        top_k = 5000
+        keep_top_k = 750
+
+        priorbox = PriorBox(self.cfg, image_size=(im_height, im_width))
+        prior_data = priorbox.forward().to(self.device).data
+        # Decoded values are fractions of the image size; scale to pixels.
+        size = torch.Tensor([im_width, im_height]).to(self.device)
+        boxes = decode(loc.data.squeeze(0), prior_data, self.cfg['variance'])
+        boxes = (boxes * size.repeat(2)).cpu().numpy()
+        scores = conf.squeeze(0).data.cpu().numpy()[:, 1]
+        landms = decode_landm(
+            landms.data.squeeze(0), prior_data, self.cfg['variance'])
+        landms = (landms * size.repeat(5)).cpu().numpy()
+
+        # ignore low scores
+        inds = np.where(scores > confidence_threshold)[0]
+        boxes, landms, scores = boxes[inds], landms[inds], scores[inds]
+
+        # keep top-K before NMS
+        order = scores.argsort()[::-1][:top_k]
+        boxes, landms, scores = boxes[order], landms[order], scores[order]
+
+        # do NMS
+        dets = np.hstack((boxes, scores[:, np.newaxis])).astype(
+            np.float32, copy=False)
+        keep = py_cpu_nms(dets, nms_threshold)
+        dets = dets[keep, :]
+        landms = landms[keep]
+
+        # keep top-K after NMS
+        dets = dets[:keep_top_k, :]
+        landms = landms[:keep_top_k, :]
+
+        # Undo the resize for coordinates only: column 4 of `dets` is the
+        # score and must not be divided by the resize factor.
+        dets[:, :4] /= ss
+        return dets, landms / ss
diff --git a/modelscope/models/cv/face_detection/retinaface/models/__init__.py b/modelscope/models/cv/face_detection/retinaface/models/__init__.py
new file mode 100755
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/retinaface/models/net.py b/modelscope/models/cv/face_detection/retinaface/models/net.py
new file mode 100755
index 00000000..3be7c4b9
--- /dev/null
+++ b/modelscope/models/cv/face_detection/retinaface/models/net.py
@@ -0,0 +1,149 @@
+# The implementation is based on Pytorch_Retinaface, available at https://github.com/biubug6/Pytorch_Retinaface
+import time
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as models
+import torchvision.models._utils as _utils
+from torch.autograd import Variable
+
+
+def conv_bn(inp, oup, stride=1, leaky=0):
+    """3x3 conv (pad 1, no bias) -> BatchNorm -> in-place LeakyReLU."""
+    conv = nn.Conv2d(inp, oup, 3, stride, 1, bias=False)
+    return nn.Sequential(conv, nn.BatchNorm2d(oup), nn.LeakyReLU(leaky, True))
+
+
+def conv_bn_no_relu(inp, oup, stride):
+    """3x3 conv (pad 1, no bias) followed by BatchNorm, with no activation."""
+    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1,
+                     bias=False)
+    return nn.Sequential(conv, nn.BatchNorm2d(oup))
+
+
+def conv_bn1X1(inp, oup, stride, leaky=0):
+    """1x1 conv (no padding, no bias) -> BatchNorm -> in-place LeakyReLU."""
+    conv = nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False)
+    return nn.Sequential(conv, nn.BatchNorm2d(oup), nn.LeakyReLU(leaky, True))
+
+
+def conv_dw(inp, oup, stride, leaky=0.1):
+    """Depthwise 3x3 conv + pointwise 1x1 conv, each with BN and LeakyReLU."""
+    # depthwise stage: one 3x3 filter per input channel (groups=inp)
+    depthwise = nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False)
+    layers = [depthwise, nn.BatchNorm2d(inp), nn.LeakyReLU(leaky, True)]
+    # pointwise stage: 1x1 conv changing the channel count to `oup`
+    pointwise = nn.Conv2d(inp, oup, 1, 1, 0, bias=False)
+    layers += [pointwise, nn.BatchNorm2d(oup), nn.LeakyReLU(leaky, True)]
+    return nn.Sequential(*layers)
+
+
+class SSH(nn.Module):
+    """Context module concatenating a 3x3, a 5x5 and a 7x7 branch."""
+
+    def __init__(self, in_channel, out_channel):
+        """Create the branches; `out_channel` must be divisible by 4."""
+        super().__init__()
+        assert out_channel % 4 == 0
+        leaky = 0.1 if out_channel <= 64 else 0
+        half, quarter = out_channel // 2, out_channel // 4
+        self.conv3X3 = conv_bn_no_relu(in_channel, half, stride=1)
+
+        # "5x5" branch: two stacked 3x3 convolutions
+        self.conv5X5_1 = conv_bn(in_channel, quarter, stride=1, leaky=leaky)
+        self.conv5X5_2 = conv_bn_no_relu(quarter, quarter, stride=1)
+
+        # "7x7" branch: shares conv5X5_1, then two more 3x3 convolutions
+        self.conv7X7_2 = conv_bn(quarter, quarter, stride=1, leaky=leaky)
+        self.conv7x7_3 = conv_bn_no_relu(quarter, quarter, stride=1)
+
+    def forward(self, input):
+        """Return ReLU of the channel-wise concatenation of the branches.
+
+        The result has `out_channel` channels: half from the 3x3 branch and
+        a quarter from each of the other two.
+        """
+        branch3 = self.conv3X3(input)
+        shared = self.conv5X5_1(input)
+        branch5 = self.conv5X5_2(shared)
+        branch7 = self.conv7x7_3(self.conv7X7_2(shared))
+
+        merged = torch.cat([branch3, branch5, branch7], dim=1)
+        return F.relu(merged)
+
+
+class FPN(nn.Module):
+    """Three-level feature pyramid fused with nearest-neighbour resizing."""
+
+    def __init__(self, in_channels_list, out_channels):
+        """Create three 1x1 lateral blocks and two 3x3 merge blocks."""
+        super().__init__()
+        leaky = 0.1 if out_channels <= 64 else 0
+
+        def lateral(level):
+            return conv_bn1X1(
+                in_channels_list[level], out_channels, stride=1, leaky=leaky)
+
+        self.output1 = lateral(0)
+        self.output2 = lateral(1)
+        self.output3 = lateral(2)
+
+        self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky)
+        self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky)
+
+    def forward(self, input):
+        """Fuse the first three values of the mapping `input`.
+
+        Returns:
+            List `[output1, output2, output3]`, each with `out_channels`.
+        """
+        feats = list(input.values())
+        level1, level2, level3 = (
+            self.output1(feats[0]), self.output2(feats[1]),
+            self.output3(feats[2]))
+
+        # level 3 is resized to level 2 and added; the sum goes through merge2
+        level2 = self.merge2(level2 + F.interpolate(
+            level3, size=[level2.size(2), level2.size(3)], mode='nearest'))
+        # the merged level 2 is resized to level 1 in the same way
+        level1 = self.merge1(level1 + F.interpolate(
+            level2, size=[level1.size(2), level1.size(3)], mode='nearest'))
+        return [level1, level2, level3]
+
+
+class MobileNetV1(nn.Module):
+    """Three-stage depthwise-separable backbone with a 1000-way classifier.
+
+    `stage1`, `stage2` and `stage3` end with 64, 128 and 256 channels and
+    each starts from the previous stage's output.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # (in_channels, out_channels, stride) of every conv_dw block
+        stage1 = [(8, 16, 1), (16, 32, 2), (32, 32, 1), (32, 64, 2),
+                  (64, 64, 1)]
+        stage2 = [(64, 128, 2)] + [(128, 128, 1)] * 5
+        stage3 = [(128, 256, 2), (256, 256, 1)]
+
+        # stage1 opens with a plain stride-2 3x3 conv on the 3-channel input
+        self.stage1 = nn.Sequential(
+            conv_bn(3, 8, 2, leaky=0.1),
+            *(conv_dw(*spec) for spec in stage1))
+        self.stage2 = nn.Sequential(*(conv_dw(*spec) for spec in stage2))
+        self.stage3 = nn.Sequential(*(conv_dw(*spec) for spec in stage3))
+        self.avg = nn.AdaptiveAvgPool2d((1, 1))
+        self.fc = nn.Linear(256, 1000)
+
+    def forward(self, x):
+        """Run the three stages, global average pooling and the classifier.
+
+        Returns:
+            Tensor with 1000 values per row; the pooled features are
+            flattened to rows of 256 before the linear layer.
+        """
+        for stage in (self.stage1, self.stage2, self.stage3):
+            x = stage(x)
+        x = self.avg(x)
+        return self.fc(x.view(-1, 256))
diff --git a/modelscope/models/cv/face_detection/retinaface/models/retinaface.py b/modelscope/models/cv/face_detection/retinaface/models/retinaface.py
new file mode 100755
index 00000000..8d2001dd
--- /dev/null
+++ b/modelscope/models/cv/face_detection/retinaface/models/retinaface.py
@@ -0,0 +1,145 @@
+# The implementation is based on Pytorch_Retinaface, available at https://github.com/biubug6/Pytorch_Retinaface
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as models
+import torchvision.models._utils as _utils
+import torchvision.models.detection.backbone_utils as backbone_utils
+
+from .net import FPN, SSH, MobileNetV1
+
+
+class ClassHead(nn.Module):
+    """1x1 conv head emitting two class scores per anchor."""
+
+    def __init__(self, inchannels=512, num_anchors=3):
+        super().__init__()
+        self.num_anchors = num_anchors
+        self.conv1x1 = nn.Conv2d(
+            inchannels, num_anchors * 2, kernel_size=(1, 1), stride=1,
+            padding=0)
+
+    def forward(self, x):
+        """Map (B, C, H, W) features to (B, H * W * num_anchors, 2)."""
+        batch = x.shape[0]
+        scores = self.conv1x1(x)
+        # move channels last so each anchor's pair of values is contiguous
+        scores = scores.permute(0, 2, 3, 1).contiguous()
+        return scores.view(batch, -1, 2)
+
+
+class BboxHead(nn.Module):
+    """1x1 conv head emitting four box regression values per anchor."""
+
+    def __init__(self, inchannels=512, num_anchors=3):
+        super().__init__()
+        self.conv1x1 = nn.Conv2d(
+            inchannels, num_anchors * 4, kernel_size=(1, 1), stride=1,
+            padding=0)
+
+    def forward(self, x):
+        """Map (B, C, H, W) features to (B, H * W * num_anchors, 4)."""
+        batch = x.shape[0]
+        deltas = self.conv1x1(x)
+        # move channels last so each anchor's values are contiguous
+        deltas = deltas.permute(0, 2, 3, 1).contiguous()
+        return deltas.view(batch, -1, 4)
+
+
+class LandmarkHead(nn.Module):
+    """1x1 conv head emitting ten landmark regression values per anchor."""
+
+    def __init__(self, inchannels=512, num_anchors=3):
+        super().__init__()
+        self.conv1x1 = nn.Conv2d(
+            inchannels, num_anchors * 10, kernel_size=(1, 1), stride=1,
+            padding=0)
+
+    def forward(self, x):
+        """Map (B, C, H, W) features to (B, H * W * num_anchors, 10)."""
+        batch = x.shape[0]
+        offsets = self.conv1x1(x)
+        # move channels last so each anchor's values are contiguous
+        offsets = offsets.permute(0, 2, 3, 1).contiguous()
+        return offsets.view(batch, -1, 10)
+
+
+class RetinaFace(nn.Module):
+    """RetinaFace network: ResNet-50 body, FPN, SSH modules and heads."""
+
+    def __init__(self, cfg=None):
+        """
+        :param cfg:  Network related settings. The keys read here are
+            'name' (must be 'Resnet50'), 'pretrain', 'return_layers',
+            'in_channel' and 'out_channel'.
+        """
+        super().__init__()
+        if cfg['name'] != 'Resnet50':
+            raise Exception('Invalid name')
+        backbone = models.resnet50(pretrained=cfg['pretrain'])
+
+        self.body = _utils.IntermediateLayerGetter(backbone,
+                                                   cfg['return_layers'])
+        base = cfg['in_channel']
+        # channel counts of the three backbone feature maps fed to the FPN
+        in_channels_list = [base * 2, base * 4, base * 8]
+        out_channels = cfg['out_channel']
+        self.fpn = FPN(in_channels_list, out_channels)
+        self.ssh1 = SSH(out_channels, out_channels)
+        self.ssh2 = SSH(out_channels, out_channels)
+        self.ssh3 = SSH(out_channels, out_channels)
+
+        # one head of each kind per pyramid level
+        head_args = dict(fpn_num=3, inchannels=cfg['out_channel'])
+        self.ClassHead = self._make_class_head(**head_args)
+        self.BboxHead = self._make_bbox_head(**head_args)
+        self.LandmarkHead = self._make_landmark_head(**head_args)
+
+    def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2):
+        """Return a ModuleList holding `fpn_num` ClassHead modules."""
+        return nn.ModuleList(
+            [ClassHead(inchannels, anchor_num) for _ in range(fpn_num)])
+
+    def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2):
+        """Return a ModuleList holding `fpn_num` BboxHead modules."""
+        return nn.ModuleList(
+            [BboxHead(inchannels, anchor_num) for _ in range(fpn_num)])
+
+    def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2):
+        """Return a ModuleList holding `fpn_num` LandmarkHead modules."""
+        return nn.ModuleList(
+            [LandmarkHead(inchannels, anchor_num) for _ in range(fpn_num)])
+
+    def forward(self, inputs):
+        """Run the body, FPN, SSH modules and all heads on `inputs`.
+
+        Returns:
+            Tuple `(bbox_regressions, class_probabilities, ldm_regressions)`;
+            each concatenates the three pyramid levels along dim 1, and the
+            class scores are softmax-normalised over the last dim.
+        """
+        out = self.body(inputs)
+
+        # FPN
+        pyramid = self.fpn(out)
+
+        # SSH
+        features = [
+            self.ssh1(pyramid[0]),
+            self.ssh2(pyramid[1]),
+            self.ssh3(pyramid[2]),
+        ]
+
+        def run_heads(heads):
+            # apply head i to feature i, then join the levels along dim 1
+            return torch.cat(
+                [heads[i](feat) for i, feat in enumerate(features)], dim=1)
+
+        bbox_regressions = run_heads(self.BboxHead)
+        classifications = run_heads(self.ClassHead)
+        ldm_regressions = run_heads(self.LandmarkHead)
+
+        class_probabilities = F.softmax(classifications, dim=-1)
+        return (bbox_regressions, class_probabilities, ldm_regressions)
diff --git a/modelscope/models/cv/face_detection/retinaface/utils.py b/modelscope/models/cv/face_detection/retinaface/utils.py
new file mode 100755
index 00000000..60c9e2dd
--- /dev/null
+++ b/modelscope/models/cv/face_detection/retinaface/utils.py
@@ -0,0 +1,123 @@
+# --------------------------------------------------------
+# Modified from https://github.com/biubug6/Pytorch_Retinaface
+# --------------------------------------------------------
+
+from itertools import product as product
+from math import ceil
+
+import numpy as np
+import torch
+
+
+class PriorBox(object):
+    """Generate anchor boxes, normalised by the image size."""
+
+    def __init__(self, cfg, image_size=None, phase='train'):
+        """Read 'min_sizes', 'steps' and 'clip' from `cfg`.
+
+        `image_size` is (height, width); `phase` is accepted but not used.
+        """
+        super().__init__()
+        self.min_sizes = cfg['min_sizes']
+        self.steps = cfg['steps']
+        self.clip = cfg['clip']
+        self.image_size = image_size
+        # one [rows, cols] grid per step, rounded up
+        self.feature_maps = [[ceil(self.image_size[0] / step),
+                              ceil(self.image_size[1] / step)]
+                             for step in self.steps]
+        self.name = 's'
+
+    def forward(self):
+        """Return a (num_anchors, 4) tensor with rows `cx, cy, w, h`.
+
+        Anchors are emitted per feature map, cell by cell in row-major
+        order, one per min size; values are clamped to [0, 1] if `clip`.
+        """
+        anchors = []
+        for k, (rows, cols) in enumerate(self.feature_maps):
+            height, width = self.image_size[0], self.image_size[1]
+            sizes, step = self.min_sizes[k], self.steps[k]
+            for i, j in product(range(rows), range(cols)):
+                cx = (j + 0.5) * step / width
+                cy = (i + 0.5) * step / height
+                for min_size in sizes:
+                    anchors += [cx, cy, min_size / width, min_size / height]
+        output = torch.Tensor(anchors).view(-1, 4)
+        if self.clip:
+            output.clamp_(max=1, min=0)
+        return output
+
+
+def py_cpu_nms(dets, thresh):
+    """Pure Python NMS baseline.
+
+    Args:
+        dets: array with rows `x1, y1, x2, y2, score`.
+        thresh: boxes whose IoU with a kept box exceeds this are dropped.
+
+    Returns:
+        List of kept row indices, highest score first.
+    """
+    x1, y1, x2, y2, scores = (dets[:, col] for col in range(5))
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        best, rest = order[0], order[1:]
+        keep.append(best)
+
+        # intersection of the best box with every remaining box
+        w = np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest])
+        h = np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest])
+        inter = np.maximum(0.0, w + 1) * np.maximum(0.0, h + 1)
+        ovr = inter / (areas[best] + areas[rest] - inter)
+
+        # only boxes that overlap the best one little enough stay in play
+        order = rest[ovr <= thresh]
+
+    return keep
+
+
+# Adapted from https://github.com/Hakuyume/chainer-ssd
+def decode(loc, priors, variances):
+    """Turn regression offsets into corner-form boxes using the priors.
+
+    Args:
+        loc (tensor): location predictions, shape [num_priors, 4].
+        priors (tensor): prior boxes in center-size form (cx, cy, w, h),
+            shape [num_priors, 4].
+        variances (list[float]): scale applied to the centre offsets
+            (index 0) and to the log-size offsets (index 1).
+    Return:
+        tensor of shape [num_priors, 4] holding (x1, y1, x2, y2).
+    """
+
+    # decoded box in centre-size form
+    centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
+    sizes = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
+    # convert to the two corners
+    top_left = centers - sizes / 2
+    return torch.cat((top_left, top_left + sizes), 1)
+
+
+def decode_landm(pre, priors, variances):
+    """Turn landmark offsets into point coordinates using the priors.
+
+    Args:
+        pre (tensor): landmark predictions, shape [num_priors, 10],
+            read as five consecutive (x, y) offset pairs.
+        priors (tensor): prior boxes in center-size form,
+            shape [num_priors, 4].
+        variances (list[float]): only index 0 is used, as offset scale.
+    Return:
+        tensor of shape [num_priors, 10] with the decoded points.
+    """
+    centers, sizes = priors[:, :2], priors[:, 2:]
+    # every pair is an offset from the prior centre, in units of its size
+    points = [
+        centers + pre[:, start:start + 2] * variances[0] * sizes
+        for start in range(0, 10, 2)
+    ]
+    return torch.cat(points, dim=1)
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index c0f3cbd0..d4f9c6bf 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -2,7 +2,6 @@
 
 import os.path as osp
 from abc import ABC, abstractmethod
-from contextlib import contextmanager
 from threading import Lock
 from typing import Any, Dict, Generator, List, Mapping, Union
 
diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py
new file mode 100644
index 00000000..20111c11
--- /dev/null
+++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py
@@ -0,0 +1,55 @@
+import os.path as osp
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_detection.retinaface import detection
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_detection, module_name=Pipelines.retina_face_detection)
+class RetinaFaceDetectionPipeline(Pipeline):
+    """Face detection pipeline backed by `RetinaFaceDetection`."""
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a face detection pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {ckpt_path}')
+        self.detector = detection.RetinaFaceDetection(
+            model_path=ckpt_path, device=self.device)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Load the image as a float32 array under the 'img' key."""
+        image = LoadImage.convert_to_ndarray(input)
+        return {'img': image.astype(np.float32)}
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run the detector and split its output into plain Python lists."""
+        result = self.detector(input)
+        assert result is not None
+        dets = result[0]
+        # columns 0-3 of `dets` hold the box, column 4 the score
+        return {
+            OutputKeys.SCORES: dets[:, 4].tolist(),
+            OutputKeys.BOXES: dets[:, :4].tolist(),
+            OutputKeys.KEYPOINTS: result[1].tolist(),
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Return the forward output unchanged."""
+        return inputs
diff --git a/tests/pipelines/test_retina_face_detection.py b/tests/pipelines/test_retina_face_detection.py
new file mode 100644
index 00000000..343e1c91
--- /dev/null
+++ b/tests/pipelines/test_retina_face_detection.py
@@ -0,0 +1,33 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_result
+from modelscope.utils.test_utils import test_level
+
+
+class RetinaFaceDetectionTest(unittest.TestCase):
+    """Smoke test for the RetinaFace face detection pipeline."""
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_resnet50_face-detection_retinaface'
+
+    def show_result(self, img_path, detection_result):
+        """Render the result via draw_face_detection_result to result.png."""
+        annotated = draw_face_detection_result(img_path, detection_result)
+        cv2.imwrite('result.png', annotated)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        detector = pipeline(Tasks.face_detection, model=self.model_id)
+        image_path = 'data/test/images/retina_face_detection.jpg'
+        self.show_result(image_path, detector(image_path))
+
+
+if __name__ == '__main__':
+    unittest.main()

From adab7d3391c636818372697edc48dffb5f2d25d4 Mon Sep 17 00:00:00 2001
From: ly261666 <ly261666@alibaba-inc.com>
Date: Mon, 5 Sep 2022 09:53:58 +0800
Subject: [PATCH 052/175] =?UTF-8?q?[to=20#42322933]=20=E6=96=B0=E5=A2=9EFE?=
 =?UTF-8?q?R=E4=BA=BA=E8=84=B8=E5=B1=9E=E6=80=A7=E8=AF=86=E5=88=AB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

完成Maas-cv CR自查；
新增个Task，已经跟产品确认可以增加，正在走流程中，目前还不在https://aone.alibaba-inc.com/v2/project/1181559/req#viewIdentifier=d7f112f9d023e2108fa1b0d8这里，后续会增加过来
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9976346
---
 .../images/facial_expression_recognition.jpg  |   3 +
 modelscope/metainfo.py                        |   2 +
 .../facial_expression_recognition/__init__.py |   0
 .../fer/__init__.py                           |   0
 .../fer/facial_expression_recognition.py      |  72 ++++++++++
 .../fer/transforms.py                         | 118 ++++++++++++++++
 .../facial_expression_recognition/fer/vgg.py  |  40 ++++++
 modelscope/outputs.py                         |   8 ++
 modelscope/pipelines/builder.py               |   3 +
 .../facial_expression_recognition_pipeline.py | 128 ++++++++++++++++++
 modelscope/utils/constant.py                  |   1 +
 modelscope/utils/cv/image_utils.py            |  20 +++
 .../test_facial_expression_recognition.py     |  36 +++++
 13 files changed, 431 insertions(+)
 create mode 100644 data/test/images/facial_expression_recognition.jpg
 create mode 100644 modelscope/models/cv/facial_expression_recognition/__init__.py
 create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/__init__.py
 create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py
 create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/transforms.py
 create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/vgg.py
 create mode 100644 modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
 create mode 100644 tests/pipelines/test_facial_expression_recognition.py

diff --git a/data/test/images/facial_expression_recognition.jpg b/data/test/images/facial_expression_recognition.jpg
new file mode 100644
index 00000000..a943fa72
--- /dev/null
+++ b/data/test/images/facial_expression_recognition.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bdb1cef5a5fd5f938a856311011c4820ddc45946a470b9929c61e59b6a065633
+size 161535
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 9638268c..47608d02 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -32,6 +32,7 @@ class Models(object):
     vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
     text_driven_segmentation = 'text-driven-segmentation'
     resnet50_bert = 'resnet50-bert'
+    fer = 'fer'
     retinaface = 'retinaface'
     shop_segmentation = 'shop-segmentation'
 
@@ -119,6 +120,7 @@ class Pipelines(object):
     salient_detection = 'u2net-salient-detection'
     image_classification = 'image-classification'
     face_detection = 'resnet-face-detection-scrfd10gkps'
+    facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
     retina_face_detection = 'resnet50-face-detection-retinaface'
     live_category = 'live-category'
     general_image_classification = 'vit-base_image-classification_ImageNet-labels'
diff --git a/modelscope/models/cv/facial_expression_recognition/__init__.py b/modelscope/models/cv/facial_expression_recognition/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py b/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py
new file mode 100644
index 00000000..c5eb71a1
--- /dev/null
+++ b/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py
@@ -0,0 +1,72 @@
+# The implementation is based on Facial-Expression-Recognition, available at
+# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
+import os
+
+import cv2
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import torch.nn.functional as F
+from PIL import Image
+from torch.autograd import Variable
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from . import transforms
+from .vgg import VGG
+
+
+@MODELS.register_module(
+    Tasks.facial_expression_recognition, module_name=Models.fer)
+class FacialExpressionRecognition(TorchModel):
+
+    def __init__(self, model_path, device='cuda'):
+        super().__init__(model_path)
+        torch.set_grad_enabled(False)
+        cudnn.benchmark = True
+        self.model_path = model_path
+        self.device = device
+        self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE,
+                                           ModelFile.CONFIGURATION)
+        self.net = VGG('VGG19', cfg_path=self.cfg_path)
+        self.load_model()
+        self.net = self.net.to(device)
+        self.transform_test = transforms.Compose([
+            transforms.TenCrop(44),
+            transforms.Lambda(lambda crops: torch.stack(
+                [transforms.ToTensor()(crop) for crop in crops])),
+        ])
+
+        self.mean = np.array([[104, 117, 123]])
+
+    def load_model(self, load_to_cpu=False):
+        pretrained_dict = torch.load(
+            self.model_path, map_location=torch.device('cpu'))
+        self.net.load_state_dict(pretrained_dict['net'], strict=True)
+        self.net.eval()
+
+    def forward(self, input):
+        """Classify one face crop; returns (class scores, predicted index).
+
+        `input['img']` is read as a BGR tensor (it goes through
+        `.cpu().numpy()` and `cv2.COLOR_BGR2GRAY`).
+        """
+        img = input['img']
+        img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2GRAY)
+        img = cv2.resize(img, (48, 48))
+        img = img[:, :, np.newaxis]
+        img = np.concatenate((img, img, img), axis=2)
+        img = Image.fromarray(np.uint8(img))
+        inputs = self.transform_test(img)  # stack of ten 44x44 crops
+        ncrops, c, h, w = inputs.shape
+        inputs = inputs.view(-1, c, h, w).to(self.device)
+        # gradients are already disabled globally in __init__, so the removed
+        # `Variable(..., volatile=True)` wrapper is not needed
+        outputs = self.net(inputs)
+        outputs_avg = outputs.view(ncrops, -1).mean(0)  # avg over crops
+        # explicit dim: relying on softmax's implicit dim is deprecated
+        score = F.softmax(outputs_avg, dim=0)
+        _, predicted = torch.max(outputs_avg.data, 0)
+        return score, predicted
diff --git a/modelscope/models/cv/facial_expression_recognition/fer/transforms.py b/modelscope/models/cv/facial_expression_recognition/fer/transforms.py
new file mode 100644
index 00000000..a1448c49
--- /dev/null
+++ b/modelscope/models/cv/facial_expression_recognition/fer/transforms.py
@@ -0,0 +1,118 @@
+# The implementation is based on Facial-Expression-Recognition, available at
+# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
+import numbers
+import types
+
+import numpy as np
+import torch
+from PIL import Image
+
+
+def to_tensor(pic):
+    """Convert a PIL image to a CHW tensor; byte data is scaled to [0, 1]."""
+    # handle PIL Image
+    if pic.mode == 'I':
+        # explicit copy gives from_numpy a writable array; NumPy >= 2 also
+        # raises on copy=False whenever a copy turns out to be required
+        img = torch.from_numpy(np.array(pic, np.int32, copy=True))
+    elif pic.mode == 'I;16':
+        img = torch.from_numpy(np.array(pic, np.int16, copy=True))
+    else:
+        img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
+    # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
+    if pic.mode == 'YCbCr':
+        nchannel = 3
+    elif pic.mode == 'I;16':
+        nchannel = 1
+    else:
+        nchannel = len(pic.mode)
+    img = img.view(pic.size[1], pic.size[0], nchannel)
+    # HWC -> CHW; this transpose takes 80% of the loading time/CPU
+    img = img.transpose(0, 1).transpose(0, 2).contiguous()
+    if isinstance(img, torch.ByteTensor):
+        return img.float().div(255)
+    return img
+
+
+def center_crop(img, output_size):
+    if isinstance(output_size, numbers.Number):
+        output_size = (int(output_size), int(output_size))
+    w, h = img.size
+    th, tw = output_size
+    i = int(round((h - th) / 2.))
+    j = int(round((w - tw) / 2.))
+    return img.crop((j, i, j + tw, i + th))
+
+
+def five_crop(img, size):
+    if isinstance(size, numbers.Number):
+        size = (int(size), int(size))
+    else:
+        assert len(
+            size) == 2, 'Please provide only two dimensions (h, w) for size.'
+
+    w, h = img.size
+    crop_h, crop_w = size
+    if crop_w > w or crop_h > h:
+        raise ValueError(
+            'Requested crop size {} is bigger than input size {}'.format(
+                size, (h, w)))
+    tl = img.crop((0, 0, crop_w, crop_h))
+    tr = img.crop((w - crop_w, 0, w, crop_h))
+    bl = img.crop((0, h - crop_h, crop_w, h))
+    br = img.crop((w - crop_w, h - crop_h, w, h))
+    center = center_crop(img, (crop_h, crop_w))
+    return (tl, tr, bl, br, center)
+
+
+class TenCrop(object):
+
+    def __init__(self, size, vertical_flip=False):
+        self.size = size
+        if isinstance(size, numbers.Number):
+            self.size = (int(size), int(size))
+        else:
+            assert len(
+                size
+            ) == 2, 'Please provide only two dimensions (h, w) for size.'
+            self.size = size
+        self.vertical_flip = vertical_flip
+
+    def __call__(self, img):
+        first_five = five_crop(img, self.size)
+
+        if self.vertical_flip:
+            img = img.transpose(Image.FLIP_TOP_BOTTOM)
+        else:
+            img = img.transpose(Image.FLIP_LEFT_RIGHT)
+
+        second_five = five_crop(img, self.size)
+
+        return first_five + second_five
+
+
+class Compose(object):
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, img):
+        for t in self.transforms:
+            img = t(img)
+        return img
+
+
+class ToTensor(object):
+
+    def __call__(self, pic):
+        return to_tensor(pic)
+
+
+class Lambda(object):
+
+    def __init__(self, lambd):
+        assert isinstance(lambd, types.LambdaType)
+        self.lambd = lambd
+
+    def __call__(self, img):
+        return self.lambd(img)
diff --git a/modelscope/models/cv/facial_expression_recognition/fer/vgg.py b/modelscope/models/cv/facial_expression_recognition/fer/vgg.py
new file mode 100644
index 00000000..8120b6cc
--- /dev/null
+++ b/modelscope/models/cv/facial_expression_recognition/fer/vgg.py
@@ -0,0 +1,40 @@
+# The implementation is based on Facial-Expression-Recognition, available at
+# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+from modelscope.utils.config import Config
+
+
+class VGG(nn.Module):
+
+    def __init__(self, vgg_name, cfg_path):
+        super(VGG, self).__init__()
+        model_cfg = Config.from_file(cfg_path)['models']
+        self.features = self._make_layers(model_cfg[vgg_name])
+        self.classifier = nn.Linear(512, 7)
+
+    def forward(self, x):
+        out = self.features(x)
+        out = out.view(out.size(0), -1)
+        out = F.dropout(out, p=0.5, training=self.training)
+        out = self.classifier(out)
+        return out
+
+    def _make_layers(self, cfg):
+        layers = []
+        in_channels = 3
+        for x in cfg:
+            if x == 'M':
+                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
+            else:
+                layers += [
+                    nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
+                    nn.BatchNorm2d(x),
+                    nn.ReLU(inplace=True)
+                ]
+                in_channels = x
+        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
+        return nn.Sequential(*layers)
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 8fe71ec2..50668693 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -85,6 +85,14 @@ TASK_OUTPUTS = {
     Tasks.face_detection:
     [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],
 
+    # facial expression recognition result for single sample
+    #   {
+    #       "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02],
+    #       "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
+    #   }
+    Tasks.facial_expression_recognition:
+    [OutputKeys.SCORES, OutputKeys.LABELS],
+
     # face recognition result for single sample
     #   {
     #       "img_embedding": np.array with shape [1, D],
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index f6381857..6f901154 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -103,6 +103,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                            'damo/cv_resnet_facedetection_scrfd10gkps'),
     Tasks.face_recognition: (Pipelines.face_recognition,
                              'damo/cv_ir101_facerecognition_cfglint'),
+    Tasks.facial_expression_recognition:
+    (Pipelines.facial_expression_recognition,
+     'damo/cv_vgg19_facial-expression-recognition_fer'),
     Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints,
                               'damo/cv_mobilenet_face-2d-keypoints_alignment'),
     Tasks.video_multi_modal_embedding:
diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
new file mode 100644
index 00000000..4a80878c
--- /dev/null
+++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
@@ -0,0 +1,128 @@
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_recognition.align_face import align_face
+from modelscope.models.cv.facial_expression_recognition.fer.facial_expression_recognition import \
+    FacialExpressionRecognition
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.facial_expression_recognition,
+    module_name=Pipelines.facial_expression_recognition)
+class FacialExpressionRecognitionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a facial expression recognition pipeline
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {ckpt_path}')
+        device = torch.device(
+            f'cuda:{0}' if torch.cuda.is_available() else 'cpu')
+        fer = FacialExpressionRecognition(model_path=ckpt_path, device=device)
+        self.fer = fer
+        self.device = device
+        logger.info('load model done')
+
+        # face detection pipeline, used by preprocess() to locate the face
+        det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps'
+        self.face_detection = pipeline(
+            Tasks.face_detection, model=det_model_id)
+
+    def _choose_face(self,
+                     det_result,
+                     min_face=10,
+                     top_face=1,
+                     center_face=False,
+                     img=None):
+        '''
+        choose the main face, by default the one with maximum area
+        Args:
+            det_result: output of face detection pipeline
+            min_face: minimum size of valid face w/h
+            top_face: take faces with top max areas
+            center_face: choose the most centered face from multi faces,
+                only valid if top_face > 1; requires `img`
+            img: the detected image, only its shape (H, W, ...) is read
+        Returns:
+            (bbox, landmarks) of the chosen face, or None if no valid face
+        '''
+        bboxes = np.array(det_result[OutputKeys.BOXES])
+        landmarks = np.array(det_result[OutputKeys.KEYPOINTS])
+        if bboxes.shape[0] == 0:
+            logger.info('Warning: No face detected!')
+            return None
+        # face idx with enough size
+        face_idx = [
+            i for i, box in enumerate(bboxes)
+            if box[2] - box[0] >= min_face and box[3] - box[1] >= min_face
+        ]
+        if len(face_idx) == 0:
+            logger.info(
+                f'Warning: Face size not enough, less than {min_face}x{min_face}!'
+            )
+            return None
+        bboxes = bboxes[face_idx]
+        landmarks = landmarks[face_idx]
+        # find max faces (argsort is ascending, so the largest comes last)
+        area = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
+        sort_idx = np.argsort(area)[-top_face:]
+        # find center face
+        if top_face > 1 and center_face and bboxes.shape[0] > 1:
+            if img is None:
+                raise ValueError('center_face=True requires `img`')
+            # squared distance from each candidate box center to image center
+            cx, cy = img.shape[1] // 2, img.shape[0] // 2
+            cand = bboxes[sort_idx]
+            dist = np.square((cand[:, 0] + cand[:, 2]) / 2 - cx) + np.square(
+                (cand[:, 1] + cand[:, 3]) / 2 - cy)
+            sort_idx = [sort_idx[int(np.argmin(dist))]]
+        # last entry: the largest face, or the selected center face
+        main_idx = sort_idx[-1]
+        return bboxes[main_idx], landmarks[main_idx]
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        img = img[:, :, ::-1]
+        det_result = self.face_detection(img.copy())
+        rtn = self._choose_face(det_result)
+        face_img = None
+        if rtn is not None:
+            _, face_lmks = rtn
+            face_lmks = face_lmks.reshape(5, 2)
+            face_img, _ = align_face(img, (112, 112), face_lmks)
+            face_img = face_img.astype(np.float32)
+        result = {}
+        result['img'] = face_img
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        # preprocess() leaves 'img' as None when no usable face was found
+        if input['img'] is None:
+            return {OutputKeys.SCORES: None, OutputKeys.LABELS: None}
+        result = self.fer(input)
+        return {
+            OutputKeys.SCORES: result[0].tolist(),
+            OutputKeys.LABELS: result[1].tolist(),
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 1b738bfe..32185fb9 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -20,6 +20,7 @@ class CVTasks(object):
     animal_recognition = 'animal-recognition'
     face_detection = 'face-detection'
     face_recognition = 'face-recognition'
+    facial_expression_recognition = 'facial-expression-recognition'
     face_2d_keypoints = 'face-2d-keypoints'
     human_detection = 'human-detection'
     human_object_interaction = 'human-object-interaction'
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index ea1d95b5..cb07ba1a 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -89,6 +89,26 @@ def draw_keypoints(output, original_image):
     return image
 
 
+def draw_facial_expression_result(img_path, facial_expression_result):
+    label_idx = facial_expression_result[OutputKeys.LABELS]
+    map_list = [
+        'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'
+    ]
+    label = map_list[label_idx]
+
+    img = cv2.imread(img_path)
+    assert img is not None, f"Can't read img: {img_path}"
+    cv2.putText(
+        img,
+        'facial expression: {}'.format(label), (10, 10),
+        1,
+        1.0, (0, 255, 0),
+        thickness=1,
+        lineType=8)
+    print('facial expression: {}'.format(label))
+    return img
+
+
 def draw_face_detection_result(img_path, detection_result):
     bboxes = np.array(detection_result[OutputKeys.BOXES])
     kpss = np.array(detection_result[OutputKeys.KEYPOINTS])
diff --git a/tests/pipelines/test_facial_expression_recognition.py b/tests/pipelines/test_facial_expression_recognition.py
new file mode 100644
index 00000000..fff83ad6
--- /dev/null
+++ b/tests/pipelines/test_facial_expression_recognition.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+import numpy as np
+
+from modelscope.msdatasets import MsDataset
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_facial_expression_result
+from modelscope.utils.test_utils import test_level
+
+
+class FacialExpressionRecognitionTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_vgg19_facial-expression-recognition_fer'
+
+    def show_result(self, img_path, facial_expression_result):
+        img = draw_facial_expression_result(img_path, facial_expression_result)
+        cv2.imwrite('result.png', img)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        fer = pipeline(
+            Tasks.facial_expression_recognition, model=self.model_id)
+        img_path = 'data/test/images/facial_expression_recognition.jpg'
+        result = fer(img_path)
+        self.show_result(img_path, result)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 3e92dac3283839fef9e9e9adbc1a9c7edbe5c714 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Mon, 5 Sep 2022 09:55:26 +0800
Subject: [PATCH 053/175] [to #42322933]lazy load activate for shop
 segmentation         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10009052

---
 .../models/cv/shop_segmentation/__init__.py   | 21 ++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/modelscope/models/cv/shop_segmentation/__init__.py b/modelscope/models/cv/shop_segmentation/__init__.py
index b40a0760..072628bd 100644
--- a/modelscope/models/cv/shop_segmentation/__init__.py
+++ b/modelscope/models/cv/shop_segmentation/__init__.py
@@ -1 +1,20 @@
-from .shop_seg_base import SHOPSEG
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .shop_seg_base import SHOPSEG
+
+else:
+    _import_structure = {'shop_seg_base': ['SHOPSEG']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )

From a9c14e4eadd64e30820b689b47f5e2ebc19516f4 Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Mon, 5 Sep 2022 11:07:48 +0800
Subject: [PATCH 054/175] [to #42322933] Support saving the best checkpoint for
 inference

1. Support saving the best checkpoint for inference
2. Fix a bug that _max_iters field does not exist in trainer
3. Fix a bug that function in lambda_lr field cannot be saved to file
4. Fix a bug that save_pretrained would not be called by iterating
5. Fix a bug that interval is not passed from BestCkptHook's init
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9972765
---
 modelscope/trainers/hooks/checkpoint_hook.py  | 40 ++++++++++---------
 modelscope/trainers/hooks/hook.py             |  4 +-
 modelscope/utils/checkpoint.py                | 17 +++++---
 modelscope/utils/config.py                    |  3 ++
 .../trainers/test_finetune_text_generation.py | 22 +++++-----
 5 files changed, 48 insertions(+), 38 deletions(-)

diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index cf7a0f7a..fcd8e982 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -27,7 +27,7 @@ class CheckpointHook(Hook):
         save_last (bool): Whether to save the last checkpoint. Default: True.
     """
 
-    PRIORITY = Priority.NORMAL
+    PRIORITY = Priority.LOW
 
     def __init__(self,
                  interval=0,
@@ -75,25 +75,27 @@ class CheckpointHook(Hook):
                 self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth')
 
         save_checkpoint(trainer.model, cur_save_name, trainer.optimizer)
-        self._save_pretrained(trainer)
+        if (self.is_last_epoch(trainer)
+                and self.by_epoch) or (self.is_last_iter(trainer)
+                                       and not self.by_epoch):
+            self._save_pretrained(trainer)
 
     def _save_pretrained(self, trainer):
-        if self.is_last_epoch(trainer) and self.by_epoch:
-            output_dir = os.path.join(self.save_dir,
-                                      ModelFile.TRAIN_OUTPUT_DIR)
-            from modelscope.trainers.parallel.utils import is_parallel
+        output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        from modelscope.trainers.parallel.utils import is_parallel
 
-            if is_parallel(trainer.model):
-                model = trainer.model.module
-            else:
-                model = trainer.model
+        if is_parallel(trainer.model):
+            model = trainer.model.module
+        else:
+            model = trainer.model
 
-            if hasattr(model, 'save_pretrained'):
-                model.save_pretrained(
-                    output_dir,
-                    ModelFile.TORCH_MODEL_BIN_FILE,
-                    save_function=save_checkpoint,
-                    config=trainer.cfg.to_dict())
+        if hasattr(model, 'save_pretrained'):
+            model.save_pretrained(
+                output_dir,
+                ModelFile.TORCH_MODEL_BIN_FILE,
+                save_function=save_checkpoint,
+                config=trainer.cfg.to_dict(),
+                with_meta=False)
 
     def after_train_iter(self, trainer):
         if self.by_epoch:
@@ -133,7 +135,7 @@ class BestCkptSaverHook(CheckpointHook):
         save_dir (str): Output directory to save best checkpoint.
     """
 
-    PRIORITY = Priority.NORMAL
+    PRIORITY = Priority.LOW
     rule_map = {'max': lambda x, y: x > y, 'min': lambda x, y: x < y}
 
     def __init__(self,
@@ -141,9 +143,11 @@ class BestCkptSaverHook(CheckpointHook):
                  rule='max',
                  by_epoch=True,
                  save_optimizer=True,
-                 save_dir=None):
+                 save_dir=None,
+                 interval=0):
         assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.'
         super().__init__(
+            interval=interval,
             by_epoch=by_epoch,
             save_optimizer=save_optimizer,
             save_dir=save_dir,
diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py
index 75cc226c..1c567f1c 100644
--- a/modelscope/trainers/hooks/hook.py
+++ b/modelscope/trainers/hooks/hook.py
@@ -199,14 +199,14 @@ class Hook:
         Whether to reach the last epoch
         Returns: bool
         """
-        return trainer.epoch + 1 == trainer._max_epochs
+        return trainer.epoch + 1 == trainer.max_epochs
 
     def is_last_iter(self, trainer):
         """
         Whether to reach the last iteration in the entire training process
         Returns: bool
         """
-        return trainer.iter + 1 == trainer._max_iters
+        return trainer.iter + 1 == trainer.max_iters
 
     def get_triggered_stages(self):
         trigger_stages = set()
diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py
index 8b9d027a..425d3312 100644
--- a/modelscope/utils/checkpoint.py
+++ b/modelscope/utils/checkpoint.py
@@ -40,7 +40,8 @@ def weights_to_cpu(state_dict):
 def save_checkpoint(model: torch.nn.Module,
                     filename: str,
                     optimizer: Optional[Optimizer] = None,
-                    meta: Optional[dict] = None) -> None:
+                    meta: Optional[dict] = None,
+                    with_meta: bool = True) -> None:
     """Save checkpoint to file.
 
     The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
@@ -65,10 +66,14 @@ def save_checkpoint(model: torch.nn.Module,
         # save class name to the meta
         meta.update(CLASSES=model.CLASSES)
 
-    checkpoint = {
-        'meta': meta,
-        'state_dict': weights_to_cpu(model.state_dict())
-    }
+    if with_meta:
+        checkpoint = {
+            'meta': meta,
+            'state_dict': weights_to_cpu(model.state_dict())
+        }
+    else:
+        checkpoint = weights_to_cpu(model.state_dict())
+
     # save optimizer state dict in the checkpoint
     if isinstance(optimizer, Optimizer):
         checkpoint['optimizer'] = optimizer.state_dict()
@@ -141,7 +146,7 @@ def save_pretrained(model,
 
     # Save the ckpt to the save directory
     try:
-        save_function(model, output_ckpt_path)
+        save_function(model, output_ckpt_path, **kwargs)
     except Exception as e:
         raise Exception(
             f'During saving checkpoints, the error of "{type(e).__name__} '
diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py
index 42985db6..7d972118 100644
--- a/modelscope/utils/config.py
+++ b/modelscope/utils/config.py
@@ -9,6 +9,7 @@ import sys
 import tempfile
 import types
 from pathlib import Path
+from types import FunctionType
 from typing import Dict, Union
 
 import addict
@@ -638,6 +639,8 @@ class JSONIteratorEncoder(json.JSONEncoder):
     """
 
     def default(self, obj):
+        if isinstance(obj, FunctionType):
+            return None
         try:
             iterable = iter(obj)
         except TypeError:
diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py
index 8cdfdf01..a561effe 100644
--- a/tests/trainers/test_finetune_text_generation.py
+++ b/tests/trainers/test_finetune_text_generation.py
@@ -128,15 +128,14 @@ class TestFinetuneTextGeneration(unittest.TestCase):
 
     @unittest.skip
     def test_finetune_cnndm(self):
-        from datasets import load_dataset
-        dataset_dict = load_dataset('ccdv/cnn_dailymail', '3.0.0')
-        train_dataset = dataset_dict['train'] \
-            .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \
-            .remove_columns('id')
-        eval_dataset = dataset_dict['validation'] \
-            .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \
-            .remove_columns('id')
-        num_warmup_steps = 2000
+        from modelscope.msdatasets import MsDataset
+        dataset_dict = MsDataset.load('dureader_robust_qg')
+        train_dataset = dataset_dict['train'].to_hf_dataset() \
+            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
+        eval_dataset = dataset_dict['validation'].to_hf_dataset() \
+            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
+        num_warmup_steps = 200
+        os.environ['LOCAL_RANK'] = '0'
 
         def noam_lambda(current_step: int):
             current_step += 1
@@ -154,12 +153,11 @@ class TestFinetuneTextGeneration(unittest.TestCase):
             return cfg
 
         kwargs = dict(
-            model=self.model_id,
+            model='damo/nlp_palm2.0_text-generation_chinese-base',
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
             work_dir=self.tmp_dir,
-            cfg_modify_fn=cfg_modify_fn,
-            model_revision='beta')
+            cfg_modify_fn=cfg_modify_fn)
         trainer = build_trainer(
             name=Trainers.nlp_base_trainer, default_args=kwargs)
         trainer.train()

From b870e4eed541405380c6bbca78e44a06f947aae7 Mon Sep 17 00:00:00 2001
From: "bin.xue" <bin.xue@alibaba-inc.com>
Date: Mon, 5 Sep 2022 13:26:30 +0800
Subject: [PATCH 055/175] [to #42322933] test: use custom config to reduce test
 time         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10011826
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

、
---
 modelscope/models/audio/ans/complex_nn.py |  6 +++---
 modelscope/models/audio/ans/unet.py       |  5 +++--
 tests/trainers/audio/test_ans_trainer.py  | 10 +++++++++-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py
index c61446c2..9768eff7 100644
--- a/modelscope/models/audio/ans/complex_nn.py
+++ b/modelscope/models/audio/ans/complex_nn.py
@@ -1,7 +1,7 @@
 """
-class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d are the work of
-Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ).
-from https://github.com/sweetcocoa/DeepComplexUNetPyTorch
+The implementation of class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d
+ here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
+and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch
 
 """
 import torch
diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py
index ae66eb69..3a9c5549 100644
--- a/modelscope/models/audio/ans/unet.py
+++ b/modelscope/models/audio/ans/unet.py
@@ -1,6 +1,7 @@
 """
-Based on the work of Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ).
-from https://github.com/sweetcocoa/DeepComplexUNetPyTorch
+The implementation here is modified based on
+ Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
+and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch
 """
 import torch
 import torch.nn as nn
diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py
index 176c811f..ed8cd1fe 100644
--- a/tests/trainers/audio/test_ans_trainer.py
+++ b/tests/trainers/audio/test_ans_trainer.py
@@ -8,12 +8,14 @@ from modelscope.metainfo import Trainers
 from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.audio.audio_utils import to_segment
+from modelscope.utils.hub import read_config
 from modelscope.utils.test_utils import test_level
 
 SEGMENT_LENGTH_TEST = 640
 
 
 class TestANSTrainer(unittest.TestCase):
+    REVISION = 'beta'
 
     def setUp(self):
         self.tmp_dir = tempfile.TemporaryDirectory().name
@@ -21,6 +23,11 @@ class TestANSTrainer(unittest.TestCase):
             os.makedirs(self.tmp_dir)
 
         self.model_id = 'damo/speech_frcrn_ans_cirm_16k'
+        cfg = read_config(self.model_id, revision=self.REVISION)
+        cfg.train.max_epochs = 2
+        cfg.train.dataloader.batch_size_per_gpu = 1
+        self.cfg_file = os.path.join(self.tmp_dir, 'train_config.json')
+        cfg.dump(self.cfg_file)
 
         hf_ds = MsDataset.load(
             'ICASSP_2021_DNS_Challenge', split='test').to_hf_dataset()
@@ -39,12 +46,13 @@ class TestANSTrainer(unittest.TestCase):
     def test_trainer(self):
         kwargs = dict(
             model=self.model_id,
-            model_revision='beta',
+            model_revision=self.REVISION,
             train_dataset=self.dataset,
             eval_dataset=self.dataset,
             max_epochs=2,
             train_iters_per_epoch=2,
             val_iters_per_epoch=1,
+            cfg_file=self.cfg_file,
             work_dir=self.tmp_dir)
 
         trainer = build_trainer(

From c25e60c67dc7891a21065e912b30e276c77ccf7e Mon Sep 17 00:00:00 2001
From: ly261666 <ly261666@alibaba-inc.com>
Date: Mon, 5 Sep 2022 13:52:54 +0800
Subject: [PATCH 056/175] [to #42322933]add lazy load         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10011795
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

    * [to #42322933] 新增FER人脸属性识别
---
 .../facial_expression_recognition/__init__.py | 20 +++++++++++++++++++
 .../fer/__init__.py                           |  2 ++
 modelscope/pipelines/cv/__init__.py           |  4 ++++
 .../facial_expression_recognition_pipeline.py |  2 +-
 4 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/modelscope/models/cv/facial_expression_recognition/__init__.py b/modelscope/models/cv/facial_expression_recognition/__init__.py
index e69de29b..35a15d18 100644
--- a/modelscope/models/cv/facial_expression_recognition/__init__.py
+++ b/modelscope/models/cv/facial_expression_recognition/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .fer import FacialExpressionRecognition
+
+else:
+    _import_structure = {'fer': ['FacialExpressionRecognition']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py
index e69de29b..2546035b 100644
--- a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py
+++ b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .facial_expression_recognition import FacialExpressionRecognition
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index d3dba978..ac1ed82c 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -47,6 +47,8 @@ if TYPE_CHECKING:
     from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline
     from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline
     from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
+    from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline
+
 else:
     _import_structure = {
         'action_recognition_pipeline': ['ActionRecognitionPipeline'],
@@ -105,6 +107,8 @@ else:
         ['TextDrivenSegmentationPipeline'],
         'movie_scene_segmentation_pipeline':
         ['MovieSceneSegmentationPipeline'],
+        'facial_expression_recognition_pipelin':
+        ['FacialExpressionRecognitionPipeline']
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
index 4a80878c..c5577dcf 100644
--- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
@@ -8,7 +8,7 @@ import torch
 
 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.face_recognition.align_face import align_face
-from modelscope.models.cv.facial_expression_recognition.fer.facial_expression_recognition import \
+from modelscope.models.cv.facial_expression_recognition import \
     FacialExpressionRecognition
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline

From f4ca0b8aabe916d010f62dab685625cd3c84c28a Mon Sep 17 00:00:00 2001
From: ly261666 <ly261666@alibaba-inc.com>
Date: Mon, 5 Sep 2022 15:54:57 +0800
Subject: [PATCH 057/175] [to #42322933]add lazy import         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10012143

---
 .../models/cv/face_detection/__init__.py      | 22 +++++++++++++++++++
 .../cv/face_detection/retinaface/__init__.py  |  1 +
 modelscope/pipelines/cv/__init__.py           |  2 ++
 .../cv/retina_face_detection_pipeline.py      |  7 ++++--
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py
index e69de29b..a3c47164 100644
--- a/modelscope/models/cv/face_detection/__init__.py
+++ b/modelscope/models/cv/face_detection/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .retinaface import RetinaFaceDetection
+
+else:
+    _import_structure = {
+        'retinaface': ['RetinaFaceDetection'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/face_detection/retinaface/__init__.py b/modelscope/models/cv/face_detection/retinaface/__init__.py
index e69de29b..779aaf1c 100644
--- a/modelscope/models/cv/face_detection/retinaface/__init__.py
+++ b/modelscope/models/cv/face_detection/retinaface/__init__.py
@@ -0,0 +1 @@
+from .detection import RetinaFaceDetection
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index ac1ed82c..960ed621 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -47,6 +47,7 @@ if TYPE_CHECKING:
     from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline
     from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline
     from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
+    from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline
     from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline
 
 else:
@@ -107,6 +108,7 @@ else:
         ['TextDrivenSegmentationPipeline'],
         'movie_scene_segmentation_pipeline':
         ['MovieSceneSegmentationPipeline'],
+        'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'],
         'facial_expression_recognition_pipelin':
         ['FacialExpressionRecognitionPipeline']
     }
diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py
index 20111c11..b8c64405 100644
--- a/modelscope/pipelines/cv/retina_face_detection_pipeline.py
+++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py
@@ -1,10 +1,13 @@
 import os.path as osp
 from typing import Any, Dict
 
+import cv2
 import numpy as np
+import PIL
+import torch
 
 from modelscope.metainfo import Pipelines
-from modelscope.models.cv.face_detection.retinaface import detection
+from modelscope.models.cv.face_detection import RetinaFaceDetection
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -28,7 +31,7 @@ class RetinaFaceDetectionPipeline(Pipeline):
         super().__init__(model=model, **kwargs)
         ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
         logger.info(f'loading model from {ckpt_path}')
-        detector = detection.RetinaFaceDetection(
+        detector = RetinaFaceDetection(
             model_path=ckpt_path, device=self.device)
         self.detector = detector
         logger.info('load model done')

From 042cff7d68dce03f12a010d8b3723395fccde998 Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Mon, 5 Sep 2022 16:08:50 +0800
Subject: [PATCH 058/175] [to #44702084]fix: ci pip install domain in single
 commands, find with requirement install failed is complicated.         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10014958

    * [to #44702084]fix: CI installs each domain's requirements one package per pip command, because finding which requirement failed in a single bulk install is complicated.
---
 .dev_scripts/ci_container_test.sh | 10 +++++-----
 .dev_scripts/citest.sh            | 19 -------------------
 tests/run_config.yaml             |  5 +----
 3 files changed, 6 insertions(+), 28 deletions(-)
 delete mode 100644 .dev_scripts/citest.sh

diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index a53c08c6..194a48b3 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -1,8 +1,8 @@
-pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 pip install -r requirements/tests.txt
 
 git config --global --add safe.directory /Maas-lib
diff --git a/.dev_scripts/citest.sh b/.dev_scripts/citest.sh
deleted file mode 100644
index c6e0905f..00000000
--- a/.dev_scripts/citest.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-
-pip install -r requirements/tests.txt
-# install numpy<=1.18 for tensorflow==1.15.x
-pip install "numpy<=1.18"
-
-# linter test
-# use internal project for pre-commit due to the network problem
-pre-commit run --all-files
-if [ $? -ne 0 ]; then
-    echo "linter test failed, please run 'pre-commit run --all-files' to check"
-    exit -1
-fi
-
-PYTHONPATH=. python tests/run.py
diff --git a/tests/run_config.yaml b/tests/run_config.yaml
index 591dcd66..f44053f6 100644
--- a/tests/run_config.yaml
+++ b/tests/run_config.yaml
@@ -1,7 +1,4 @@
-# envs option allows fine-grained control for test executoin, for example,
-# python tests/run.py --env pytorch
-# would only trigger exeutions of all pytorch cases.
-# envs option defaults to None for backward compatbility
+# Isolate test cases by env so that different dependencies can be installed in each env.
 isolated:  # test cases that may require excessive anmount of GPU memory, which will be executed in dedicagted process.
   - test_text_to_speech.py
   - test_multi_modal_embedding.py

From f660a119f02cbf767521eea322f96faf2bb883c8 Mon Sep 17 00:00:00 2001
From: "xingjun.wxj" <xingjun.wxj@alibaba-inc.com>
Date: Mon, 5 Sep 2022 16:19:45 +0800
Subject: [PATCH 059/175] [to #42322933]Add resumable and large data upload.

CR Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9995250

1. add resumable dataset upload
2. add large data upload (up to 48.8TB)
---
 modelscope/msdatasets/ms_dataset.py         |  8 +------
 modelscope/msdatasets/utils/oss_utils.py    | 24 +++++++++++++++------
 modelscope/msdatasets/utils/upload_utils.py | 22 +++++++++----------
 3 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 338c6333..28a95643 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -574,14 +574,8 @@ class MsDataset:
             None
 
         """
-        from modelscope.hub.api import HubApi
-        _hub_api = HubApi()
-        cookies = _hub_api.check_cookies_upload_data(use_cookies=True)
         _upload_manager = DatasetUploadManager(
-            dataset_name=dataset_name,
-            namespace=namespace,
-            version=version,
-            cookies=cookies)
+            dataset_name=dataset_name, namespace=namespace, version=version)
         _upload_manager.upload(object_name, local_file_path)
 
     @staticmethod
diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py
index 63a1cf77..9a7040a1 100644
--- a/modelscope/msdatasets/utils/oss_utils.py
+++ b/modelscope/msdatasets/utils/oss_utils.py
@@ -18,6 +18,12 @@ class OssUtilities:
         self.oss_dir = oss_config['Dir']
         self.oss_backup_dir = oss_config['BackupDir']
 
+        self.upload_resumable_tmp_store = '/tmp/modelscope/tmp_dataset'
+        self.upload_multipart_threshold = 50 * 1024 * 1024
+        self.upload_part_size = 1 * 1024 * 1024
+        self.upload_num_threads = 4
+        self.upload_max_retries = 3
+
     @staticmethod
     def _percentage(consumed_bytes, total_bytes):
         if total_bytes:
@@ -42,21 +48,27 @@ class OssUtilities:
                 progress_callback=self._percentage)
         return local_path
 
-    def upload(self, oss_file_name: str, local_file_path: str) -> str:
-        max_retries = 3
+    def upload(self, oss_object_name: str, local_file_path: str) -> str:
         retry_count = 0
-        object_key = os.path.join(self.oss_dir, oss_file_name)
+        object_key = os.path.join(self.oss_dir, oss_object_name)
+        resumable_store = oss2.ResumableStore(
+            root=self.upload_resumable_tmp_store)
 
         while True:
             try:
                 retry_count += 1
-                self.bucket.put_object_from_file(
+                oss2.resumable_upload(
+                    self.bucket,
                     object_key,
                     local_file_path,
-                    progress_callback=self._percentage)
+                    store=resumable_store,
+                    multipart_threshold=self.upload_multipart_threshold,
+                    part_size=self.upload_part_size,
+                    progress_callback=self._percentage,
+                    num_threads=self.upload_num_threads)
                 break
             except Exception:
-                if retry_count >= max_retries:
+                if retry_count >= self.upload_max_retries:
                     raise
 
         return object_key
diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py
index eff3aca0..fbe5c531 100644
--- a/modelscope/msdatasets/utils/upload_utils.py
+++ b/modelscope/msdatasets/utils/upload_utils.py
@@ -1,23 +1,21 @@
-from http.cookiejar import CookieJar
-
 from .oss_utils import OssUtilities
 
 
 class DatasetUploadManager(object):
 
-    def __init__(self, dataset_name: str, namespace: str, version: str,
-                 cookies: CookieJar):
+    def __init__(self, dataset_name: str, namespace: str, version: str):
         from modelscope.hub.api import HubApi
-        api = HubApi()
-        oss_config = api.get_dataset_access_config_session(
-            cookies=cookies,
+        _hub_api = HubApi()
+        _cookies = _hub_api.check_cookies_upload_data(use_cookies=True)
+        _oss_config = _hub_api.get_dataset_access_config_session(
+            cookies=_cookies,
             dataset_name=dataset_name,
             namespace=namespace,
             revision=version)
 
-        self.oss_utilities = OssUtilities(oss_config)
+        self.oss_utilities = OssUtilities(_oss_config)
 
-    def upload(self, oss_file_name: str, local_file_path: str) -> str:
-        oss_object_key = self.oss_utilities.upload(
-            oss_file_name=oss_file_name, local_file_path=local_file_path)
-        return oss_object_key
+    def upload(self, object_name: str, local_file_path: str) -> str:
+        object_key = self.oss_utilities.upload(
+            oss_object_name=object_name, local_file_path=local_file_path)
+        return object_key

From 4484dcaa04ca49b7e90954b032118922ee7811ba Mon Sep 17 00:00:00 2001
From: "liangting.zl" <liangting.zl@alibaba-inc.com>
Date: Mon, 5 Sep 2022 16:42:40 +0800
Subject: [PATCH 060/175] [to #42322933]  feat: add hand keypoints pipeline    
     Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9961906

    * feat: add hand keypoints pipeline
---
 data/test/images/hand_keypoints.jpg           |  3 ++
 modelscope/metainfo.py                        |  1 +
 modelscope/outputs.py                         | 15 ++++++
 modelscope/pipelines/builder.py               |  3 ++
 modelscope/pipelines/cv/__init__.py           |  2 +
 .../cv/hand_2d_keypoints_pipeline.py          | 51 +++++++++++++++++++
 modelscope/utils/constant.py                  |  1 +
 tests/pipelines/test_hand_2d_keypoints.py     | 45 ++++++++++++++++
 8 files changed, 121 insertions(+)
 create mode 100644 data/test/images/hand_keypoints.jpg
 create mode 100644 modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
 create mode 100644 tests/pipelines/test_hand_2d_keypoints.py

diff --git a/data/test/images/hand_keypoints.jpg b/data/test/images/hand_keypoints.jpg
new file mode 100644
index 00000000..cb445c26
--- /dev/null
+++ b/data/test/images/hand_keypoints.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c05d58edee7398de37b8e479410676d6b97cfde69cc003e8356a348067e71988
+size 7750
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 47608d02..3ac2f2df 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -112,6 +112,7 @@ class Pipelines(object):
     hicossl_video_embedding = 'hicossl-s3dg-video_embedding'
     body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
     body_3d_keypoints = 'canonical_body-3d-keypoints_video'
+    hand_2d_keypoints = 'hrnetv2w18_hand-2d-keypoints_image'
     human_detection = 'resnet18-human-detection'
     object_detection = 'vit-object-detection'
     easycv_detection = 'easycv-detection'
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 50668693..c6a7a619 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -219,6 +219,21 @@ TASK_OUTPUTS = {
     # }
     Tasks.body_3d_keypoints: [OutputKeys.POSES],
 
+    # 2D hand keypoints result for single sample
+    # {
+    #     "keypoints": [
+    #                     [[x, y, score] * 21],
+    #                     [[x, y, score] * 21],
+    #                     [[x, y, score] * 21],
+    #                  ],
+    #     "boxes": [
+    #                 [x1, y1, x2, y2],
+    #                 [x1, y1, x2, y2],
+    #                 [x1, y1, x2, y2],
+    #             ]
+    # }
+    Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES],
+
     # video single object tracking result for single video
     # {
     #   "boxes": [
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 6f901154..9f265fb8 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -99,6 +99,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                               'damo/cv_hrnetv2w32_body-2d-keypoints_image'),
     Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints,
                               'damo/cv_canonical_body-3d-keypoints_video'),
+    Tasks.hand_2d_keypoints:
+    (Pipelines.hand_2d_keypoints,
+     'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'),
     Tasks.face_detection: (Pipelines.face_detection,
                            'damo/cv_resnet_facedetection_scrfd10gkps'),
     Tasks.face_recognition: (Pipelines.face_recognition,
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 960ed621..72a225ff 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -9,6 +9,7 @@ if TYPE_CHECKING:
     from .animal_recognition_pipeline import AnimalRecognitionPipeline
     from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline
     from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline
+    from .hand_2d_keypoints_pipeline import Hand2DKeypointsPipeline
     from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline
     from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline
     from .crowd_counting_pipeline import CrowdCountingPipeline
@@ -57,6 +58,7 @@ else:
         'animal_recognition_pipeline': ['AnimalRecognitionPipeline'],
         'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'],
         'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'],
+        'hand_2d_keypoints_pipeline': ['Hand2DKeypointsPipeline'],
         'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'],
         'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'],
         'crowd_counting_pipeline': ['CrowdCountingPipeline'],
diff --git a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
new file mode 100644
index 00000000..db66f5d2
--- /dev/null
+++ b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
@@ -0,0 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import ModelFile, Tasks
+from .easycv_pipelines.base import EasyCVPipeline
+
+
+@PIPELINES.register_module(
+    Tasks.hand_2d_keypoints, module_name=Pipelines.hand_2d_keypoints)
+class Hand2DKeypointsPipeline(EasyCVPipeline):
+    """Pipeline for hand pose keypoint task."""
+
+    def __init__(self,
+                 model: str,
+                 model_file_pattern=ModelFile.TORCH_MODEL_FILE,
+                 *args,
+                 **kwargs):
+        """
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): model file pattern.
+        """
+        self.model_dir = model
+        super(Hand2DKeypointsPipeline, self).__init__(
+            model=model,
+            model_file_pattern=model_file_pattern,
+            *args,
+            **kwargs)
+
+    def _build_predict_op(self):
+        """Build EasyCV predictor."""
+        from easycv.predictors.builder import build_predictor
+        detection_predictor_type = self.cfg['DETECTION']['type']
+        detection_model_path = os.path.join(
+            self.model_dir, self.cfg['DETECTION']['model_path'])
+        detection_cfg_file = os.path.join(self.model_dir,
+                                          self.cfg['DETECTION']['config_file'])
+        detection_score_threshold = self.cfg['DETECTION']['score_threshold']
+        self.cfg.pipeline.predictor_config[
+            'detection_predictor_config'] = dict(
+                type=detection_predictor_type,
+                model_path=detection_model_path,
+                config_file=detection_cfg_file,
+                score_threshold=detection_score_threshold)
+        easycv_config = self._to_easycv_config()
+        pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, {
+            'model_path': self.model_path,
+            'config_file': easycv_config
+        })
+        return pipeline_op
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 32185fb9..47d38dd7 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -27,6 +27,7 @@ class CVTasks(object):
     face_image_generation = 'face-image-generation'
     body_2d_keypoints = 'body-2d-keypoints'
     body_3d_keypoints = 'body-3d-keypoints'
+    hand_2d_keypoints = 'hand-2d-keypoints'
     general_recognition = 'general-recognition'
 
     image_classification = 'image-classification'
diff --git a/tests/pipelines/test_hand_2d_keypoints.py b/tests/pipelines/test_hand_2d_keypoints.py
new file mode 100644
index 00000000..86cd2d06
--- /dev/null
+++ b/tests/pipelines/test_hand_2d_keypoints.py
@@ -0,0 +1,45 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class Hand2DKeypointsPipelineTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_hand_2d_keypoints(self):
+        img_path = 'data/test/images/hand_keypoints.jpg'
+        model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'
+
+        hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints, model=model_id)
+        outputs = hand_keypoint(img_path)
+        self.assertEqual(len(outputs), 1)
+
+        results = outputs[0]
+        self.assertIn(OutputKeys.KEYPOINTS, results.keys())
+        self.assertIn(OutputKeys.BOXES, results.keys())
+        self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21)
+        self.assertEqual(results[OutputKeys.KEYPOINTS].shape[2], 3)
+        self.assertEqual(results[OutputKeys.BOXES].shape[1], 4)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_hand_2d_keypoints_with_default_model(self):
+        img_path = 'data/test/images/hand_keypoints.jpg'
+
+        hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints)
+        outputs = hand_keypoint(img_path)
+        self.assertEqual(len(outputs), 1)
+
+        results = outputs[0]
+        self.assertIn(OutputKeys.KEYPOINTS, results.keys())
+        self.assertIn(OutputKeys.BOXES, results.keys())
+        self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21)
+        self.assertEqual(results[OutputKeys.KEYPOINTS].shape[2], 3)
+        self.assertEqual(results[OutputKeys.BOXES].shape[1], 4)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 83dbf713020b7c45cd22b0ebcc366eb73ec5d899 Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Mon, 5 Sep 2022 17:38:05 +0800
Subject: [PATCH 061/175] [to #44702084]fix: ci pip install domain in single
 commands, find with requirement install failed is complicated.         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10019738

    * [to #44702084]fix: CI installs each domain's requirements one package per pip command, because finding which requirement failed in a single bulk install is complicated.
---
 .dev_scripts/ci_container_test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index 194a48b3..129a6c25 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -1,4 +1,4 @@
-awk -F: '/^[^#]/ { print $1 }' requirements.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html

From 3d3f9b45377abad27b9e9272ee294a2f2ee50ea9 Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Mon, 5 Sep 2022 17:51:22 +0800
Subject: [PATCH 062/175] [to #42322933] fix checkpoint format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 修复 palm，gpt3，mplug 模型存在的 finetune 后保存 checkpoint 与原有 checkpoint key 字段存在区别无法使用 from_pretrained 导入的问题
2. 调整 test_finetune_mplug.py 为只保存训练结束时的 checkpoint，减少 ci 耗时
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10016517
---
 .../multi_modal/mplug/modeling_mplug.py       | 10 +++---
 modelscope/models/nlp/gpt3/modeling_gpt3.py   |  4 +++
 .../models/nlp/palm_v2/modeling_palm.py       | 16 ++++-----
 tests/trainers/test_finetune_mplug.py         | 33 ++++++++++---------
 4 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py
index 78f60f9b..f469c218 100755
--- a/modelscope/models/multi_modal/mplug/modeling_mplug.py
+++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py
@@ -1867,11 +1867,13 @@ class MPlug(PreTrainedModel):
                                            ModelFile.TORCH_MODEL_BIN_FILE)
             checkpoint = torch.load(checkpoint_path, map_location='cpu')
             if 'model' in checkpoint:
-                state_dict = checkpoint['model']
-            else:
-                state_dict = checkpoint['module']
+                checkpoint = checkpoint['model']
+            checkpoint = {
+                k.replace('model.', ''): v
+                for k, v in checkpoint.items()
+            }
 
-            msg = model.load_state_dict(state_dict, strict=False)
+            msg = model.load_state_dict(checkpoint, strict=False)
             print('load checkpoint from %s' % checkpoint_path)
             print(msg)
         return model
diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py
index 4e30f697..69e9ba7c 100644
--- a/modelscope/models/nlp/gpt3/modeling_gpt3.py
+++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py
@@ -339,5 +339,9 @@ class GPT3Model(PreTrainedModel):
         state_dict_file = os.path.join(pretrained_model_name_or_path,
                                        ModelFile.TORCH_MODEL_BIN_FILE)
         state_dict = torch.load(state_dict_file)
+        state_dict = {
+            k.replace('model.language_model', 'language_model'): v
+            for k, v in state_dict.items()
+        }
         model.load_state_dict(state_dict)
         return model
diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py
index ff6fd732..99b00454 100644
--- a/modelscope/models/nlp/palm_v2/modeling_palm.py
+++ b/modelscope/models/nlp/palm_v2/modeling_palm.py
@@ -592,11 +592,11 @@ class AbsSummarizer(PalmPreTrainedModel):  # Model
         self.generator.dense.weight = self.decoder.embeddings.weight
 
         if checkpoint is not None:
-            for key in list(checkpoint['model'].keys()):
-                checkpoint['model'][key.replace('module.',
-                                                '')] = checkpoint['model'][key]
-            msg = self.load_state_dict(checkpoint['model'], strict=False)
-            print(msg)
+            if 'model' in checkpoint:
+                checkpoint = checkpoint['model']
+            for key in list(checkpoint.keys()):
+                checkpoint[key.replace('model.palm.', '')] = checkpoint[key]
+            self.load_state_dict(checkpoint, strict=False)
         else:
             for module in self.decoder.modules():
                 if isinstance(module, (nn.Linear, nn.Embedding)):
@@ -734,7 +734,7 @@ class PalmForConditionalGeneration(PalmPreTrainedModel):
         return addict.Dict(loss=loss)
 
 
-class Translator(nn.Module):
+class Translator(object):
     """
     Uses a model to translate a batch of sentences.
     """
@@ -1298,8 +1298,8 @@ class Translator(nn.Module):
 
         return results
 
-    def forward(self, input_ids: torch.Tensor,
-                attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
+    def __call__(self, input_ids: torch.Tensor,
+                 attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
         batch = self.Batch(
             batch_size=input_ids.size()[0],
             src=input_ids,
diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index b46dbf45..72196fba 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -41,6 +41,18 @@ class TestFinetuneMPlug(unittest.TestCase):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
+    def _cfg_modify_fn(self, cfg):
+        cfg.train.hooks = [{
+            'type': 'CheckpointHook',
+            'interval': self.max_epochs
+        }, {
+            'type': 'TextLoggerHook',
+            'interval': 1
+        }, {
+            'type': 'IterTimerHook'
+        }]
+        return cfg
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer_with_caption(self):
         kwargs = dict(
@@ -48,15 +60,12 @@ class TestFinetuneMPlug(unittest.TestCase):
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
             max_epochs=self.max_epochs,
-            work_dir=self.tmp_dir)
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=self._cfg_modify_fn)
 
         trainer: EpochBasedTrainer = build_trainer(
             name=Trainers.nlp_base_trainer, default_args=kwargs)
         trainer.train()
-        results_files = os.listdir(self.tmp_dir)
-        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(self.max_epochs):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_trainer_with_caption_with_model_and_args(self):
@@ -86,15 +95,12 @@ class TestFinetuneMPlug(unittest.TestCase):
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
             max_epochs=self.max_epochs,
-            work_dir=self.tmp_dir)
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=self._cfg_modify_fn)
 
         trainer: EpochBasedTrainer = build_trainer(
             name=Trainers.nlp_base_trainer, default_args=kwargs)
         trainer.train()
-        results_files = os.listdir(self.tmp_dir)
-        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(self.max_epochs):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_trainer_with_vqa_with_model_and_args(self):
@@ -124,15 +130,12 @@ class TestFinetuneMPlug(unittest.TestCase):
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
             max_epochs=self.max_epochs,
-            work_dir=self.tmp_dir)
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=self._cfg_modify_fn)
 
         trainer: EpochBasedTrainer = build_trainer(
             name=Trainers.nlp_base_trainer, default_args=kwargs)
         trainer.train()
-        results_files = os.listdir(self.tmp_dir)
-        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(self.max_epochs):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_trainer_with_retrieval_with_model_and_args(self):

From e365023862995b921f74d902a69667933fa58060 Mon Sep 17 00:00:00 2001
From: "feiwu.yfw" <feiwu.yfw@alibaba-inc.com>
Date: Mon, 5 Sep 2022 19:36:46 +0800
Subject: [PATCH 063/175] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dprocessor=E8=BE=93?=
 =?UTF-8?q?=E5=87=BAtorch.tensor=E6=97=B6=E8=A2=AB=E8=BD=AC=E4=B8=BAnumpy?=
 =?UTF-8?q?=E7=9A=84=E5=BC=82=E5=B8=B8=20=20=20=20=20=20=20=20=20Link:=20h?=
 =?UTF-8?q?ttps://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/100218?=
 =?UTF-8?q?02?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

    * fix to_torch_dataset
---
 modelscope/msdatasets/ms_dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 28a95643..691db4fe 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -70,12 +70,12 @@ class MsIterableDataset(torch.utils.data.IterableDataset):
         for idx in range(iter_start, iter_end):
             item_dict = self.dataset[idx]
             res = {
-                k: np.array(item_dict[k])
+                k: torch.tensor(item_dict[k])
                 for k in self.columns if k in self.retained_columns
             }
             for preprocessor in self.preprocessor_list:
                 res.update({
-                    k: np.array(v)
+                    k: torch.tensor(v)
                     for k, v in preprocessor(item_dict).items()
                     if k in self.retained_columns
                 })

From 904374d329a648a9d4e587fdb7fc2c94ebcbb816 Mon Sep 17 00:00:00 2001
From: "suluyan.sly" <suluyan.sly@alibaba-inc.com>
Date: Mon, 5 Sep 2022 20:58:08 +0800
Subject: [PATCH 064/175] [to #42322933] feat: plug inference         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9931748

---
 modelscope/metainfo.py                        |    2 +
 modelscope/models/nlp/__init__.py             |    2 +
 modelscope/models/nlp/plug/__init__.py        |   27 +
 .../models/nlp/plug/configuration_plug.py     |  232 ++++
 .../models/nlp/plug/distributed_plug.py       |  191 +++
 modelscope/models/nlp/plug/modeling_plug.py   | 1054 +++++++++++++++++
 modelscope/pipelines/base.py                  |  108 ++
 .../nlp/distributed_plug_pipeline.py          |  107 ++
 modelscope/preprocessors/nlp.py               |    3 +-
 modelscope/trainers/trainer.py                |    7 +-
 modelscope/utils/nlp/distributed.py           |  130 ++
 modelscope/utils/nlp/load_checkpoint.py       |  117 ++
 modelscope/utils/torch_utils.py               |   21 +-
 requirements/nlp.txt                          |    2 +
 tests/pipelines/test_plug_text_generation.py  |   49 +
 15 files changed, 2044 insertions(+), 8 deletions(-)
 create mode 100644 modelscope/models/nlp/plug/__init__.py
 create mode 100644 modelscope/models/nlp/plug/configuration_plug.py
 create mode 100644 modelscope/models/nlp/plug/distributed_plug.py
 create mode 100644 modelscope/models/nlp/plug/modeling_plug.py
 create mode 100644 modelscope/pipelines/nlp/distributed_plug_pipeline.py
 create mode 100755 modelscope/utils/nlp/distributed.py
 create mode 100755 modelscope/utils/nlp/load_checkpoint.py
 create mode 100644 tests/pipelines/test_plug_text_generation.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 3ac2f2df..792bd708 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -55,6 +55,7 @@ class Models(object):
     lcrf = 'lstm-crf'
     bart = 'bart'
     gpt3 = 'gpt3'
+    plug = 'plug'
     bert_for_ds = 'bert-for-document-segmentation'
 
     # audio models
@@ -172,6 +173,7 @@ class Pipelines(object):
     dialog_state_tracking = 'dialog-state-tracking'
     zero_shot_classification = 'zero-shot-classification'
     text_error_correction = 'text-error-correction'
+    plug_generation = 'plug-generation'
     faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
     relation_extraction = 'relation-extraction'
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index fd61e40b..9d54834c 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -28,6 +28,7 @@ if TYPE_CHECKING:
                               SingleBackboneTaskModelBase)
     from .bart_for_text_error_correction import BartForTextErrorCorrection
     from .gpt3 import GPT3ForTextGeneration
+    from .plug import PlugForTextGeneration
     from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering
 
 else:
@@ -60,6 +61,7 @@ else:
         ],
         'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
         'gpt3': ['GPT3ForTextGeneration'],
+        'plug': ['PlugForTextGeneration'],
         'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'],
     }
 
diff --git a/modelscope/models/nlp/plug/__init__.py b/modelscope/models/nlp/plug/__init__.py
new file mode 100644
index 00000000..b74258a4
--- /dev/null
+++ b/modelscope/models/nlp/plug/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .configuration_plug import PlugNLGConfig
+    from .modeling_plug import PlugModel
+    from .distributed_plug import DistributedPlug
+    from .plug_for_text_generation import PlugForTextGeneration
+else:
+    _import_structure = {
+        'configuration_plug': ['PlugNLGConfig'],
+        'modeling_plug': ['PlugModel'],
+        'distributed_plug': ['DistributedPlug'],
+        'plug_for_text_generation': ['PlugForTextGeneration'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration_plug.py
new file mode 100644
index 00000000..64807392
--- /dev/null
+++ b/modelscope/models/nlp/plug/configuration_plug.py
@@ -0,0 +1,232 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+
+import json
+from transformers import PretrainedConfig
+
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class PlugNLUConfig(PretrainedConfig):
+    model_type = 'plugNLU'
+
+    def __init__(self,
+                 vocab_size=21504,
+                 original_vocab_size=21128,
+                 hidden_size=8192,
+                 num_hidden_layers=24,
+                 num_attention_heads=128,
+                 intermediate_size=32768,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=2048,
+                 type_vocab_size=3,
+                 initializer_range=0.00707,
+                 deep_init=False,
+                 deepspeed=False,
+                 lr_decay_style='linear',
+                 weight_decay=1e-2,
+                 clip_grad=1.0,
+                 warmup=0.0333,
+                 pre_ln=True,
+                 fp16=True,
+                 fp32_layernorm=True,
+                 fp32_embedding=False,
+                 fp32_tokentypes=False,
+                 layernorm_epsilon=1e-5,
+                 dec_hidden_layers=6,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0,
+                 pruning_initial_threshold=1.0,
+                 pruning_final_threshold=0.01,
+                 pruning_initial_warmup=1,
+                 pruning_final_warmup=20,
+                 pruning_module='decoder',
+                 pruning_decay_step=50,
+                 pruning_decay_type='exp',
+                 ft_module=None,
+                 attn_separate=False,
+                 LR_weight_rank=8,
+                 LR_mask_rank=8,
+                 **kwargs):
+        super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.original_vocab_size = original_vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.deep_init = deep_init
+        self.deepspeed = deepspeed
+        self.lr_decay_style = lr_decay_style
+        self.weight_decay = weight_decay
+        self.clip_grad = clip_grad
+        self.warmup = warmup
+        self.pre_ln = pre_ln
+        self.fp16 = fp16
+        self.fp32_layernorm = fp32_layernorm
+        self.fp32_embedding = fp32_embedding
+        self.layernorm_epsilon = layernorm_epsilon
+        self.fp32_tokentypes = fp32_tokentypes
+        self.dec_hidden_layers = dec_hidden_layers
+        self.pruning_method = pruning_method
+        self.pruning_mask_init = pruning_mask_init
+        self.pruning_mask_scale = pruning_mask_scale
+        self.pruning_module = pruning_module
+        self.pruning_initial_threshold = pruning_initial_threshold
+        self.pruning_final_threshold = pruning_final_threshold
+        self.pruning_initial_warmup = pruning_initial_warmup
+        self.pruning_final_warmup = pruning_final_warmup
+        self.pruning_decay_step = pruning_decay_step
+        self.pruning_decay_type = pruning_decay_type
+        self.ft_module = ft_module
+        self.attn_separate = attn_separate
+        self.LR_weight_rank = LR_weight_rank
+        self.LR_mask_rank = LR_mask_rank
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Build a `PlugNLUConfig` (regardless of `cls`) and copy every key of `json_object` onto it."""
+        config = PlugNLUConfig()
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Read a UTF-8 json file and build the config from its content via `from_dict`."""
+        with open(json_file, 'r', encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def merge_args(self, args):
+        """Copy attributes of `args` that are not already set on this config, then return self."""
+        local_keys = self.__dict__.keys()
+        for key, value in args.__dict__.items():
+            if key in local_keys:
+                continue
+            self.__dict__[key] = value
+        return self
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n'
+
+
+class PlugNLGConfig(PlugNLUConfig):  # NLG variant: re-assigns the NLU attributes with its own defaults
+    model_type = 'plugNLG'
+
+    def __init__(self,
+                 vocab_size=21504,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.00707,
+                 deep_init=False,
+                 deepspeed=False,
+                 lr_decay_style='linear',
+                 weight_decay=1e-2,
+                 clip_grad=1.0,
+                 warmup=0.01,
+                 pre_ln=False,
+                 fp16=False,
+                 fp32_layernorm=False,
+                 fp32_embedding=False,
+                 fp32_tokentypes=False,
+                 layernorm_epsilon=1e-12,
+                 dec_hidden_layers=6,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0,
+                 pruning_initial_threshold=1.0,
+                 pruning_final_threshold=0.01,
+                 pruning_initial_warmup=1,
+                 pruning_final_warmup=20,
+                 pruning_module='decoder',
+                 pruning_decay_step=50,
+                 pruning_decay_type='exp',
+                 ft_module=None,
+                 attn_separate=False,
+                 LR_weight_rank=8,
+                 LR_mask_rank=8,
+                 **kwargs):
+        super().__init__(layernorm_epsilon=layernorm_epsilon, **kwargs)  # parent forwards it as layer_norm_eps
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.deep_init = deep_init
+        self.deepspeed = deepspeed
+        self.lr_decay_style = lr_decay_style
+        self.weight_decay = weight_decay
+        self.clip_grad = clip_grad
+        self.warmup = warmup
+        self.pre_ln = pre_ln
+        self.fp16 = fp16
+        self.fp32_layernorm = fp32_layernorm
+        self.fp32_embedding = fp32_embedding
+        self.layernorm_epsilon = layernorm_epsilon
+        self.fp32_tokentypes = fp32_tokentypes
+        self.dec_hidden_layers = dec_hidden_layers
+        self.pruning_method = pruning_method
+        self.pruning_mask_init = pruning_mask_init
+        self.pruning_mask_scale = pruning_mask_scale
+        self.pruning_module = pruning_module
+        self.pruning_initial_threshold = pruning_initial_threshold
+        self.pruning_final_threshold = pruning_final_threshold
+        self.pruning_initial_warmup = pruning_initial_warmup
+        self.pruning_final_warmup = pruning_final_warmup
+        self.pruning_decay_step = pruning_decay_step
+        self.pruning_decay_type = pruning_decay_type
+        self.ft_module = ft_module
+        self.attn_separate = attn_separate
+        self.LR_weight_rank = LR_weight_rank
+        self.LR_mask_rank = LR_mask_rank
diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py
new file mode 100644
index 00000000..2992f595
--- /dev/null
+++ b/modelscope/models/nlp/plug/distributed_plug.py
@@ -0,0 +1,191 @@
+import os
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from megatron import mpu
+from megatron.fp16 import FP16_Module
+from megatron.utils import print_rank_0
+
+from modelscope.models import TorchModel
+from modelscope.models.base import Tensor
+from modelscope.utils.logger import get_logger
+from modelscope.utils.nlp.distributed import initialize_distributed
+from modelscope.utils.nlp.load_checkpoint import pre_load
+from modelscope.utils.torch_utils import set_random_seed_mpu
+from . import PlugModel
+from .configuration_plug import PlugNLGConfig
+
+logger = get_logger(__name__)
+
+
+class DistributedPlug(TorchModel):
+
+    def __init__(self, model_dir, rank, **kwargs):
+        super().__init__(model_dir, **kwargs)
+        self.rank = rank
+        self.model_cfg = kwargs
+        self.config = PlugNLGConfig.from_pretrained(model_dir)
+        initialize_distributed(rank, mpu, kwargs['world_size'],
+                               kwargs['model_parallel_size'],
+                               kwargs['master_ip'], kwargs['master_port'])
+        seed = 0 if 'seed' not in kwargs else kwargs['seed']
+        set_random_seed_mpu(seed)
+        self.iteration = 0
+        self.dist_model = self.initialize_model(path_load_tag='model')
+
+    def initialize_model(self, path_load_tag='model'):
+        """Build PlugModel on the current CUDA device (fp16-wrapped if configured) and load pre_load() weights."""
+        print_rank_0('Building Plug model. It will take a few minutes ...')
+        model = PlugModel(self.config)
+
+        if mpu.get_data_parallel_rank() == 0:
+            logger.info(
+                ' > number of parameters on model parallel rank {}: {}'.format(
+                    mpu.get_model_parallel_rank(),
+                    sum([p.nelement() for p in model.parameters()])))
+
+        if self.config.deepspeed and self.config.fp16:
+            model.half()
+
+        # GPU allocation.
+        model.cuda(torch.cuda.current_device())
+
+        # Fp16 conversion; selected sub-modules are cast back to fp32 per config flags.
+        if self.config.fp16:
+            model = FP16_Module(model)
+            if self.config.fp32_embedding:
+                model.module.model.bert.embeddings.word_embeddings.float()
+                model.module.model.bert.embeddings.position_embeddings.float()
+                model.module.model.bert.embeddings.token_type_embeddings.float(
+                )
+            if self.config.fp32_tokentypes:
+                model.module.model.bert.embeddings.token_type_embeddings.float(
+                )
+            if self.config.fp32_layernorm:
+                for name, _module in model.named_modules():
+                    if 'LayerNorm' in name:
+                        _module.float()
+
+        load_model = pre_load(mpu, self.model_dir, tag=path_load_tag)
+        model_dict = model.module.model.state_dict()  # assumes FP16_Module wrapping (config.fp16) — TODO confirm
+        for key in load_model:
+            if key not in model_dict.keys():
+                print_rank_0('Skip key: ' + key)
+            else:
+                print_rank_0('Loading key: ' + key)
+        model.module.model.load_state_dict(load_model, strict=False)
+        return model
+
+    @staticmethod
+    def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+        # This function has been mostly taken from huggingface conversational ai code at
+        # https://medium.com/huggingface/how-to-build-a-state-of-the-art-
+        # conversational-ai-with-transfer-learning-2d818ac26313
+
+        if top_k > 0:
+            # Remove all tokens with a probability less than the last token of the top-k
+            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1,
+                                                                      None]
+            logits[indices_to_remove] = filter_value
+
+        if top_p > 0.0:
+            # convert to 1D (this view is only valid when the leading dimension is 1)
+            logits = logits.view(logits.size()[1]).contiguous()
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+            cumulative_probs = torch.cumsum(
+                F.softmax(sorted_logits, dim=-1), dim=-1)
+
+            # Remove tokens with cumulative probability above the threshold
+            sorted_indices_to_remove = cumulative_probs > top_p
+            # Shift the indices to the right to keep also the first token above the threshold
+            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
+                ..., :-1].clone()
+            sorted_indices_to_remove[..., 0] = 0
+            indices_to_remove = sorted_indices[sorted_indices_to_remove]
+            logits[indices_to_remove] = filter_value
+            # going back to 2D
+            logits = logits.view(1, -1).contiguous()
+        return logits
+
+    def generate(self, input: Dict[str, Tensor], out_length=128, *kwargs):  # TODO confirm: *kwargs vs **kwargs
+        device = torch.cuda.current_device()
+        batch_size = input['input_ids'].shape[0]
+        tokens = input['input_ids'].view(1, -1).contiguous().to(device)
+        dec_input_ids = input['dec_input_ids'].to(device)
+        attention_mask = input['attention_mask'].to(device)
+        self.dist_model.eval()
+        with torch.no_grad():
+            # Only supports batch_size=1
+            all_generate_tokens = []
+            generate_tokens = []
+            counter = 0
+            sequence_output = None
+            vocab_size = self.config.original_vocab_size
+            sep_token_idx = 102  # index of [SEP] token in BertTokenizer
+            while counter < out_length:
+                if counter % 128 == 0 and counter != 0:
+                    # Sliding window: every 128 steps fold generated tokens into `tokens`, reset decoder state
+                    generate_tokens.append(sep_token_idx)
+                    start = (tokens == sep_token_idx).nonzero(
+                        as_tuple=True)[-1]
+                    if start + len(generate_tokens) >= 512:
+                        tokens = torch.cat([
+                            tokens[:start],
+                            torch.cuda.LongTensor(generate_tokens)
+                        ], -1)[-512:]
+                    else:
+                        tokens[0][start:start + len(generate_tokens
+                                                    )] = torch.cuda.LongTensor(
+                                                        generate_tokens)
+
+                    attention_mask = (tokens != 0)
+                    dec_input_ids = input['dec_input_ids'].to(device)
+                    generate_tokens = []
+                    sequence_output = None
+
+                position_ids = torch.full([batch_size, 1],
+                                          len(generate_tokens),
+                                          dtype=torch.long,
+                                          device=device)
+                _, logits, sequence_output = self.dist_model(
+                    tokens,
+                    None,
+                    attention_mask,
+                    dec_input_ids,
+                    attention_mask,
+                    position_ids,
+                    is_infer=True,
+                    sequence_output=sequence_output,
+                    parallel_output=False)
+                logits = logits[:, -1, :]
+                logits = logits / self.model_cfg['temperature']
+                logits = self.top_k_logits(
+                    logits,
+                    top_k=self.model_cfg['top_k'],
+                    top_p=self.model_cfg['top_p'])
+                log_probs = F.softmax(logits, dim=-1)  # NOTE: softmax output, i.e. probabilities, not log-probs
+                prev = torch.multinomial(log_probs, num_samples=1)
+                prev_token = prev[0].item()
+                if prev_token >= vocab_size:
+                    prev_token = 100  # presumably [UNK] in BertTokenizer — confirm
+                    prev[0] = 100
+                if prev_token == 102 and len(all_generate_tokens) > int(
+                        max(1, out_length) * 0.8):
+                    break
+                if prev_token == 102:
+                    counter += 1
+                    continue
+                dec_input_ids = torch.cat([dec_input_ids, prev], dim=1)
+                generate_tokens.append(prev_token)
+                all_generate_tokens.append(prev_token)
+                counter += 1
+
+            generate_context = []
+            for token in all_generate_tokens:
+                if generate_context and generate_context[
+                        -1] == 100 and token == 100:
+                    continue
+                else:
+                    generate_context.append(token)
+            return {'generate_context': generate_context}
diff --git a/modelscope/models/nlp/plug/modeling_plug.py b/modelscope/models/nlp/plug/modeling_plug.py
new file mode 100644
index 00000000..9d2bb14f
--- /dev/null
+++ b/modelscope/models/nlp/plug/modeling_plug.py
@@ -0,0 +1,1054 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+import logging
+import math
+import os
+
+import torch
+import torch.nn.functional as F
+from deepspeed.utils.timer import SynchronizedWallClockTimer
+from megatron import mpu
+from torch import nn
+
+from modelscope.utils.nlp.distributed import (normal_init_method,
+                                              scaled_init_method)
+from .configuration_plug import PlugNLGConfig, PlugNLUConfig
+
+logger = logging.getLogger(__name__)
+
+
+def gelu(x):
+    """Implementation of the gelu activation function.
+        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    """
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def swish(x):
+    return x * torch.sigmoid(x)
+
+
+ACT2FN = {'gelu': gelu, 'relu': torch.nn.functional.relu, 'swish': swish}
+
+
+class BertLayerNorm(nn.Module):
+
+    def __init__(self, hidden_size, eps=1e-12):
+        """Construct a layernorm module in the TF style (epsilon inside the square root).
+        """
+        super(BertLayerNorm, self).__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.bias = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x):
+        u = x.mean(-1, keepdim=True)
+        s = (x - u).pow(2).mean(-1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+        return self.weight * x + self.bias
+
+
+class BertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings.
+    """
+
+    def __init__(self, config):
+        super(BertEmbeddings, self).__init__()
+        self.word_embeddings = mpu.VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            init_method=normal_init_method(
+                mean=0.0, std=config.initializer_range))
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
+                                                config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.fp32_layernorm = config.fp32_layernorm
+        self.fp32_embedding = config.fp32_embedding
+        self.fp32_tokentypes = config.fp32_tokentypes
+        self.LayerNorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, input_ids, token_type_ids=None, position_ids=None):
+        seq_length = input_ids.size(1)
+        if position_ids is None:
+            position_ids = torch.arange(
+                seq_length, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        words_embeddings = self.word_embeddings(input_ids)
+        position_embeddings = self.position_embeddings(position_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        if not self.fp32_tokentypes:
+
+            embeddings = words_embeddings + position_embeddings + token_type_embeddings
+            if self.fp32_embedding and not self.fp32_layernorm:
+                embeddings = embeddings.half()
+            previous_type = embeddings.type()
+            if self.fp32_layernorm:
+                embeddings = embeddings.float()
+            embeddings = self.LayerNorm(embeddings)
+            if self.fp32_layernorm:
+                if self.fp32_embedding:
+                    embeddings = embeddings.half()
+                else:
+                    embeddings = embeddings.type(previous_type)
+        else:
+            embeddings = words_embeddings.float() + position_embeddings.float(
+            ) + token_type_embeddings.float()
+            if self.fp32_tokentypes and not self.fp32_layernorm:
+                embeddings = embeddings.half()
+            previous_type = embeddings.type()
+            if self.fp32_layernorm:
+                embeddings = embeddings.float()
+            embeddings = self.LayerNorm(embeddings)
+            if self.fp32_layernorm:
+                if self.fp32_tokentypes:
+                    embeddings = embeddings.half()
+                else:
+                    embeddings = embeddings.type(previous_type)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class BertSelfOutput(nn.Module):
+
+    def __init__(self, config):
+        super(BertSelfOutput, self).__init__()
+        if hasattr(config, 'deep_init') and config.deep_init:
+            init_method = scaled_init_method(
+                mean=0.0,
+                std=config.initializer_range,
+                num_layers=config.num_hidden_layers)
+        else:
+            init_method = normal_init_method(
+                mean=0.0, std=config.initializer_range)
+        self.dense = mpu.RowParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.hidden_size,
+            bias=True,
+            input_is_parallel=True,
+            stride=1,
+            init_method=init_method,
+            pruning_method=config.pruning_method if config.pruning_module in [
+                'all', 'encoder', 'encoder_self', 'encoder_selfvo',
+                'encoder_selfo'
+            ] else None,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank)
+        self.fp32_layernorm = config.fp32_layernorm
+        if not config.pre_ln:
+            self.LayerNorm = BertLayerNorm(
+                config.hidden_size, eps=config.layernorm_epsilon)
+        else:
+            self.LayerNorm = None
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states,
+        input_tensor,
+        pruning_threshold=None,
+    ):
+        hidden_states = self.dense(
+            hidden_states,
+            pruning_threshold=pruning_threshold,
+        )
+        hidden_states = self.dropout(hidden_states)
+        ln_input = hidden_states + input_tensor
+        if self.LayerNorm is not None:
+            previous_type = ln_input.type()
+            if self.fp32_layernorm:
+                ln_input = ln_input.float()
+            hidden_states = self.LayerNorm(ln_input)
+            if self.fp32_layernorm:
+                hidden_states = hidden_states.type(previous_type)
+        else:
+            hidden_states = ln_input
+        return hidden_states
+
+
+class BertAttention(nn.Module):
+
+    def __init__(self, config):
+        super(BertAttention, self).__init__()
+        self.fp32_layernorm = config.fp32_layernorm
+        if config.pre_ln:
+            self.LayerNorm = BertLayerNorm(
+                config.hidden_size, eps=config.layernorm_epsilon)
+        else:
+            self.LayerNorm = None
+        self.self = mpu.BertParallelSelfAttention(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            dropout_prob=config.attention_probs_dropout_prob,
+            output_parallel=True,
+            init_method=normal_init_method(
+                mean=0.0, std=config.initializer_range),
+            separate=config.attn_separate,
+            pruning_method=config.pruning_method,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            pruning_module=config.pruning_module,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank)
+        self.output = BertSelfOutput(config)
+
+    def forward(
+        self,
+        input_tensor,
+        attention_mask,
+        pruning_threshold=None,
+    ):
+        if self.LayerNorm is not None:
+            ln_input = input_tensor
+            previous_type = input_tensor.type()
+            if self.fp32_layernorm:
+                ln_input = input_tensor.float()
+            ln_output = self.LayerNorm(ln_input)
+            if self.fp32_layernorm:
+                ln_output = ln_output.type(previous_type)
+            self_output = self.self(
+                ln_output,
+                attention_mask,
+                pruning_threshold=pruning_threshold,
+            )
+        else:
+            self_output = self.self(
+                input_tensor,
+                attention_mask,
+                pruning_threshold=pruning_threshold,
+            )
+        output_pruning_threshold = pruning_threshold
+
+        attention_output = self.output(
+            self_output,
+            input_tensor,
+            pruning_threshold=output_pruning_threshold,
+        )
+        return attention_output
+
+
+class BertIntermediate(nn.Module):
+
+    def __init__(self, config):
+        super(BertIntermediate, self).__init__()
+        self.dense = mpu.ColumnParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.intermediate_size,
+            bias=True,
+            gather_output=False,
+            stride=1,
+            init_method=normal_init_method(
+                mean=0.0, std=config.initializer_range),
+            pruning_method=config.pruning_method if config.pruning_module
+            in ['all', 'encoder', 'encoder_ffn'] else None,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank)
+        self.intermediate_act_fn = ACT2FN[config.hidden_act] \
+            if isinstance(config.hidden_act, str) else config.hidden_act
+
+    def forward(
+        self,
+        hidden_states,
+        pruning_threshold=None,
+    ):
+        hidden_states = self.dense(
+            hidden_states,
+            pruning_threshold=pruning_threshold,
+        )
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class BertOutput(nn.Module):
+
+    def __init__(self, config):
+        super(BertOutput, self).__init__()
+        if hasattr(config, 'deep_init') and config.deep_init:
+            init_method = scaled_init_method(
+                mean=0.0,
+                std=config.initializer_range,
+                num_layers=config.num_hidden_layers)
+        else:
+            init_method = normal_init_method(
+                mean=0.0, std=config.initializer_range)
+        self.dense = mpu.RowParallelLinear(
+            input_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=True,
+            input_is_parallel=True,
+            stride=1,
+            init_method=init_method,
+            pruning_method=config.pruning_method if config.pruning_module
+            in ['all', 'encoder', 'encoder_ffn'] else None,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank)
+        self.fp32_layernorm = config.fp32_layernorm
+        if not config.pre_ln:
+            self.LayerNorm = BertLayerNorm(
+                config.hidden_size, eps=config.layernorm_epsilon)
+        else:
+            self.LayerNorm = None
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states,
+        input_tensor,
+        pruning_threshold=None,
+    ):
+        hidden_states = self.dense(
+            hidden_states,
+            pruning_threshold=pruning_threshold,
+        )
+        hidden_states = self.dropout(hidden_states)
+        ln_input = hidden_states + input_tensor
+        if self.LayerNorm is not None:
+            previous_type = ln_input.type()
+            if self.fp32_layernorm:
+                ln_input = ln_input.float()
+            hidden_states = self.LayerNorm(ln_input)
+            if self.fp32_layernorm:
+                hidden_states = hidden_states.type(previous_type)
+        else:
+            hidden_states = ln_input
+        return hidden_states
+
+
+class BertLayer(nn.Module):
+
+    def __init__(self, config):
+        super(BertLayer, self).__init__()
+        self.attention = BertAttention(config)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+        self.fp32_layernorm = config.fp32_layernorm
+        if config.pre_ln:
+            self.LayerNorm = BertLayerNorm(
+                config.hidden_size, eps=config.layernorm_epsilon)
+        else:
+            self.LayerNorm = None
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        pruning_threshold=None,
+    ):
+        attention_output = self.attention(
+            hidden_states, attention_mask, pruning_threshold=pruning_threshold)
+        if self.LayerNorm is not None:
+            ln_input = attention_output
+            previous_type = attention_output.type()
+            if self.fp32_layernorm:
+                ln_input = attention_output.float()
+            ln_output = self.LayerNorm(ln_input)
+            if self.fp32_layernorm:
+                ln_output = ln_output.type(previous_type)
+            intermediate_output = self.intermediate(
+                ln_output, pruning_threshold=pruning_threshold)
+        else:
+            intermediate_output = self.intermediate(
+                attention_output, pruning_threshold=pruning_threshold)
+        layer_output = self.output(
+            intermediate_output,
+            attention_output,
+            pruning_threshold=pruning_threshold)
+        return layer_output
+
+
+class BertEncoder(nn.Module):
+
+    def __init__(self, config):
+        super(BertEncoder, self).__init__()
+        self.layer = nn.ModuleList(
+            [BertLayer(config) for _ in range(config.num_hidden_layers)])
+        self.fp32_layernorm = config.fp32_layernorm
+        if config.pre_ln:
+            self.LayerNorm = BertLayerNorm(
+                config.hidden_size, eps=config.layernorm_epsilon)
+        else:
+            self.LayerNorm = None
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        output_all_encoded_layers=True,
+        checkpoint_activations=False,
+        detach_index=-1,
+        pruning_threshold=None,
+    ):
+        all_encoder_layers = []
+
+        def custom(start, end):
+
+            def custom_forward(*inputs):
+                layers = self.layer[start:end]
+                x_ = inputs[0]
+                for layer in layers:
+                    x_ = layer(
+                        x_, inputs[1], pruning_threshold=pruning_threshold)
+                return x_
+
+            return custom_forward
+
+        if checkpoint_activations:
+            layer_idx = 0
+            num_layers = len(self.layer)
+            chunk_length = 1
+            while layer_idx < num_layers:
+                hidden_states = mpu.checkpoint(
+                    custom(layer_idx, layer_idx + chunk_length), hidden_states,
+                    attention_mask * 1)
+                if detach_index == layer_idx:
+                    hidden_states.detach_()
+                layer_idx += chunk_length
+            # NOTE(review): the else-branch below drops pruning_threshold
+        else:
+            for i, layer_module in enumerate(self.layer):
+                hidden_states = layer_module(hidden_states, attention_mask)
+                if detach_index == i:
+                    hidden_states.detach_()
+                if i == len(self.layer) - 1 and self.LayerNorm is not None:
+                    previous_type = hidden_states.type()
+                    if self.fp32_layernorm:
+                        hidden_states = hidden_states.float()
+                    hidden_states = self.LayerNorm(hidden_states)
+                    if self.fp32_layernorm:
+                        hidden_states = hidden_states.type(previous_type)
+                if output_all_encoded_layers:
+                    all_encoder_layers.append(hidden_states)
+
+        if not output_all_encoded_layers or checkpoint_activations:
+            if self.LayerNorm is not None:
+                previous_type = hidden_states.type()
+                if self.fp32_layernorm:
+                    hidden_states = hidden_states.float()
+                hidden_states = self.LayerNorm(hidden_states)
+                if self.fp32_layernorm:
+                    hidden_states = hidden_states.type(previous_type)
+            all_encoder_layers.append(hidden_states)
+        return all_encoder_layers
+
+
+class BertPooler(nn.Module):
+
+    def __init__(self, config):
+        super(BertPooler, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class BertPredictionHeadTransform(nn.Module):
+
+    def __init__(self, config):
+        super(BertPredictionHeadTransform, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.transform_act_fn = ACT2FN[config.hidden_act] \
+            if isinstance(config.hidden_act, str) else config.hidden_act
+        self.LayerNorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.fp32_layernorm = config.fp32_layernorm
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        previous_type = hidden_states.type()
+        if self.fp32_layernorm:
+            hidden_states = hidden_states.float()
+        hidden_states = self.LayerNorm(hidden_states)
+        if self.fp32_layernorm:
+            hidden_states = hidden_states.type(previous_type)
+        return hidden_states
+
+
+class BertLMPredictionHead(nn.Module):
+
+    def __init__(self, config, bert_model_embedding_weights):
+        super(BertLMPredictionHead, self).__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder_weight = bert_model_embedding_weights
+        self.bias = nn.Parameter(
+            torch.zeros(bert_model_embedding_weights.size(0)))
+        self.bias.model_parallel = True
+        self.fp32_embedding = config.fp32_embedding
+        self.fp32_layernorm = config.fp32_layernorm
+
+        def convert_to_type(tensor):
+            if self.fp32_embedding:
+                return tensor.half()
+            else:
+                return tensor
+
+        self.type_converter = convert_to_type
+        self.converted = False
+        self.timers = SynchronizedWallClockTimer()
+
+    def forward(self, hidden_states):
+        if not self.converted:
+            self.converted = True
+            if self.fp32_embedding:
+                self.transform.half()
+                if self.fp32_layernorm:
+                    self.transform.LayerNorm.float()
+        hidden_states = self.transform(self.type_converter(hidden_states))
+        self.timers('final linear gather').start()
+        hidden_states = mpu.copy_to_model_parallel_region(hidden_states)
+        self.timers('final linear gather').stop()
+        hidden_states = F.linear(
+            self.type_converter(hidden_states),
+            self.type_converter(self.decoder_weight),
+            self.type_converter(self.bias))
+        return hidden_states
+
+
+class BertPreTrainingHeads(nn.Module):
+
+    def __init__(self, config, bert_model_embedding_weights):
+        super(BertPreTrainingHeads, self).__init__()
+        self.predictions = BertLMPredictionHead(config,
+                                                bert_model_embedding_weights)
+        self.seq_relationship = nn.Linear(config.hidden_size, 3)
+
+    def forward(self, sequence_output, pooled_output):
+        prediction_scores = self.predictions(sequence_output)
+        for p in self.seq_relationship.parameters():
+            if p is None:
+                continue
+            pooled_output = pooled_output.type_as(p)
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return prediction_scores, seq_relationship_score
+
+
+class PreTrainedBertModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+
+    def __init__(self, config, *inputs, **kwargs):
+        super(PreTrainedBertModel, self).__init__()
+        if not isinstance(config, PlugNLUConfig) and not isinstance(
+                config, PlugNLGConfig):
+            raise ValueError(
+                'Parameter config in `{}(config)` should be an instance of class `BertConfig`. '
+                'To create a model from a Google pretrained model use '
+                '`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`'.format(
+                    self.__class__.__name__, self.__class__.__name__))
+        self.config = config
+
+    def init_bert_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, BertLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+
+class BertModel(PreTrainedBertModel):
+    """BERT model ("Bidirectional Encoder Representations from Transformers").
+
+    Params:
+        config: a PlugNLUConfig or PlugNLGConfig instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as
+            described below. Default: `True`.
+
+    Outputs: Tuple of (encoded_layers, pooled_output)
+        `encoded_layers`: controlled by `output_all_encoded_layers` argument:
+            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
+                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                to the last attention block of shape [batch_size, sequence_length, hidden_size],
+        `pooled_output`: a tensor of size [batch_size, hidden_size] holding the last-layer hidden state of the
+            first token of the input (typically `[CLS]`), cast to the dtype of the pooler parameters; the
+            pooler module itself is not applied in `forward`.
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = modeling.BertModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+
+    def __init__(self, config):
+        super(BertModel, self).__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+        self.pooler = BertPooler(config)
+        self.apply(self.init_bert_weights)
+
+    def forward(
+        self,
+        input_ids,
+        token_type_ids=None,
+        attention_mask=None,
+        output_all_encoded_layers=True,
+        checkpoint_activations=False,
+        detach_index=-1,
+        pruning_threshold=None,
+    ):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        # We create a 4D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is simpler than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(
+            dtype=next(self.encoder.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        embedding_output = self.embeddings(input_ids, token_type_ids)
+        encoded_layers = self.encoder(
+            embedding_output,
+            extended_attention_mask,
+            output_all_encoded_layers=output_all_encoded_layers,
+            checkpoint_activations=checkpoint_activations,
+            detach_index=detach_index,
+            pruning_threshold=pruning_threshold)
+        sequence_output = encoded_layers[-1]
+        for p in self.pooler.parameters():
+            if p is None:
+                continue
+            sequence_output = sequence_output.type_as(p)
+            break
+
+        pooled_output = sequence_output[:, 0]
+        if not output_all_encoded_layers or checkpoint_activations:
+            encoded_layers = encoded_layers[-1]
+        return encoded_layers, pooled_output
+
+
+class DecodeLayer(nn.Module):
+
+    def __init__(self, config):
+        super(DecodeLayer, self).__init__()
+        init_method = normal_init_method(
+            mean=0.0, std=config.initializer_range)
+        output_layer_init_method = scaled_init_method(
+            mean=0.0,
+            std=config.initializer_range,
+            num_layers=config.num_hidden_layers)
+
+        self_pruning_method = config.pruning_method
+        cross_pruning_method = config.pruning_method
+        ffn_pruning_method = config.pruning_method
+
+        if config.ft_module is not None:
+            if 'decoder_self' in config.ft_module:
+                self_pruning_method = 'finetune'
+            if 'decoder_cross' in config.ft_module:
+                cross_pruning_method = 'finetune'
+            if 'decoder_ffn' in config.ft_module:
+                ffn_pruning_method = 'finetune'
+
+        self.attention = mpu.GPT2ParallelSelfAttention(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            attention_dropout_prob=config.attention_probs_dropout_prob,
+            output_dropout_prob=config.hidden_dropout_prob,
+            init_method=init_method,
+            output_layer_init_method=output_layer_init_method,
+            pruning_method=self_pruning_method if config.pruning_module in [
+                'all', 'decoder', 'decoder_self', 'decoder_self+ffn'
+            ] else None,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank,
+        )
+
+        self.cross_attention = mpu.PalmParallelCrossAttention(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            attention_dropout_prob=config.attention_probs_dropout_prob,
+            output_dropout_prob=config.hidden_dropout_prob,
+            init_method=init_method,
+            attn_separate=False,
+            output_layer_init_method=output_layer_init_method,
+            pruning_method=cross_pruning_method,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            pruning_module=config.pruning_module,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank,
+        )
+
+        self.input_layernorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.post_attention_layernorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.post_cross_attention_layernorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+
+        self.intermediate = mpu.ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            gather_output=False,
+            init_method=init_method,
+            pruning_method=ffn_pruning_method if config.pruning_module
+            in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank,
+        )
+        self.intermediate_act_fn = ACT2FN[config.hidden_act] \
+            if isinstance(config.hidden_act, str) else config.hidden_act
+        self.output = mpu.RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            input_is_parallel=True,
+            init_method=output_layer_init_method,
+            pruning_method=ffn_pruning_method if config.pruning_module
+            in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank,
+        )
+
+        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
+        self.fp32_layernorm = config.fp32_layernorm
+
+        def convert_to_type(tensor):
+            if self.fp32_layernorm:
+                return tensor.float()
+            else:
+                return tensor
+
+        self.type_converter = convert_to_type
+
+    # def forward(self, hidden_states, enc_attn_mask, dec_attn_mask):
+    def forward(self,
+                hidden_states,
+                enc_hidden_states,
+                enc_attn_mask,
+                dec_attn_mask,
+                is_infer=False,
+                pruning_threshold=None):
+        residual = hidden_states
+        previous_type = hidden_states.type()
+        hidden_states = self.input_layernorm(
+            self.type_converter(hidden_states))
+        if self.fp32_layernorm:
+            hidden_states = hidden_states.type(previous_type)
+        hidden_states = self.attention(
+            hidden_states,
+            dec_attn_mask,
+            is_infer=is_infer,
+            pruning_threshold=pruning_threshold)
+
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(
+            self.type_converter(hidden_states))
+        if self.fp32_layernorm:
+            hidden_states = hidden_states.type(previous_type)
+        hidden_states = self.cross_attention(
+            hidden_states,
+            enc_hidden_states,
+            enc_attn_mask,
+            pruning_threshold=pruning_threshold)
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.post_cross_attention_layernorm(
+            self.type_converter(hidden_states))
+        if self.fp32_layernorm:
+            hidden_states = hidden_states.type(previous_type)
+        hidden_states = self.intermediate(
+            hidden_states, pruning_threshold=pruning_threshold)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        hidden_states = self.output(
+            hidden_states, pruning_threshold=pruning_threshold)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class BertDecoder(nn.Module):
+
+    def __init__(self, config):
+        super(BertDecoder, self).__init__()
+        self.layer = nn.ModuleList(
+            [DecodeLayer(config) for _ in range(config.dec_hidden_layers)])
+
+        self.final_layernorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.fp32_layernorm = config.fp32_layernorm
+
+    def forward(self,
+                hidden_states,
+                enc_hidden_states,
+                enc_attn_mask,
+                dec_attn_mask,
+                checkpoint_activations=False,
+                output_all_encoded_layers=False,
+                is_infer=False,
+                pruning_threshold=None):
+
+        def custom(start, end):
+
+            def custom_forward(*inputs):
+                layers = self.layer[start:end]
+                x_ = inputs[0]
+                for layer in layers:
+                    x_ = layer(
+                        x_,
+                        inputs[1],
+                        inputs[2],
+                        dec_attn_mask * 1,
+                        is_infer=is_infer,
+                        pruning_threshold=pruning_threshold)
+                return x_
+
+            return custom_forward
+
+        pre_enc_hidden = enc_hidden_states.data
+        if checkpoint_activations:
+            layer_idx = 0
+            num_layers = len(self.layer)
+            chunk_length = 1
+            while layer_idx < num_layers:
+                hidden_states = mpu.checkpoint(
+                    custom(layer_idx, layer_idx + chunk_length), hidden_states,
+                    enc_hidden_states, enc_attn_mask * 1)
+                enc_hidden_states.data = pre_enc_hidden
+                layer_idx += chunk_length
+        else:
+            for i, layer_module in enumerate(self.layer):
+                hidden_states = layer_module(
+                    hidden_states,
+                    enc_hidden_states,
+                    enc_attn_mask,
+                    dec_attn_mask,
+                    is_infer=is_infer,
+                    pruning_threshold=pruning_threshold)
+
+        previous_type = hidden_states.type()
+        if self.fp32_layernorm:
+            hidden_states = hidden_states.float()
+        hidden_states = self.final_layernorm(hidden_states)
+        if self.fp32_layernorm:
+            hidden_states = hidden_states.type(previous_type)
+
+        return [hidden_states]
+
+
+class DecodeModel(PreTrainedBertModel):
+
+    def __init__(self, config):
+        super(DecodeModel, self).__init__(config)
+        self.decoder = BertDecoder(config)
+        self.apply(self.init_bert_weights)
+
+    def forward(self,
+                embeddings,
+                sequence_output,
+                decode_input_ids,
+                position_ids=None,
+                enc_attn_mask=None,
+                dec_attn_mask=None,
+                checkpoint_activations=False,
+                is_infer=False,
+                pruning_threshold=None):
+        extended_attention_mask = enc_attn_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = extended_attention_mask.to(
+            dtype=next(self.decoder.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        embedding_output = embeddings(decode_input_ids)
+        sequence_output = self.decoder(
+            embedding_output,
+            sequence_output,
+            extended_attention_mask,
+            dec_attn_mask,
+            checkpoint_activations=False,
+            is_infer=is_infer,
+            pruning_threshold=pruning_threshold)
+        return sequence_output[-1]
+
+
+class PalmForPreTraining(PreTrainedBertModel):
+
+    def __init__(self, config):
+        super(PalmForPreTraining, self).__init__(config)
+        self.bert = BertModel(config)
+        self.cls = BertPreTrainingHeads(
+            config, self.bert.embeddings.word_embeddings.weight)
+        self.decoder = DecodeModel(config)
+        self.apply(self.init_bert_weights)
+
+    def forward(self,
+                input_ids,
+                token_type_ids=None,
+                attention_mask=None,
+                decode_input_ids=None,
+                position_ids=None,
+                decode_attention_mask=None,
+                lm_labels=None,
+                checkpoint_activations=False,
+                is_infer=False,
+                sequence_output=None,
+                parallel_output=True,
+                pruning_threshold=None):
+        if sequence_output is None:
+            sequence_output, pooled_output = self.bert(
+                input_ids,
+                token_type_ids,
+                attention_mask,
+                output_all_encoded_layers=False,
+                checkpoint_activations=checkpoint_activations,
+                pruning_threshold=pruning_threshold)
+            prediction_scores, seq_relationship_score = self.cls(
+                sequence_output, pooled_output)
+        else:
+            prediction_scores = None
+            sequence_output = sequence_output.to(
+                dtype=next(self.decoder.parameters()).dtype)
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        decode_output = self.decoder(
+            self.bert.embeddings,
+            sequence_output,
+            decode_input_ids,
+            position_ids,
+            attention_mask,
+            decode_attention_mask,
+            checkpoint_activations=checkpoint_activations,
+            is_infer=is_infer,
+            pruning_threshold=pruning_threshold)
+
+        transformer_output_parallel = mpu.copy_to_model_parallel_region(
+            decode_output)
+
+        logits_parallel = F.linear(transformer_output_parallel,
+                                   self.bert.embeddings.word_embeddings.weight)
+
+        if parallel_output:
+            return prediction_scores, logits_parallel
+        if is_infer:
+            return prediction_scores, mpu.gather_from_model_parallel_region(
+                logits_parallel), sequence_output
+        return prediction_scores, mpu.gather_from_model_parallel_region(
+            logits_parallel)
+
+
+class PlugModel(torch.nn.Module):
+
+    def __init__(self, config):
+        super(PlugModel, self).__init__()
+        self.config = config
+        self.model = PalmForPreTraining(self.config)
+
+    def forward(self,
+                input_tokens,
+                token_type_ids=None,
+                attention_mask=None,
+                target_tokens=None,
+                position_ids=None,
+                decode_attention_mask=None,
+                checkpoint_activations=False,
+                is_infer=False,
+                sequence_output=None,
+                parallel_output=True):
+        return self.model(
+            input_tokens,
+            token_type_ids,
+            attention_mask,
+            target_tokens,
+            position_ids,
+            decode_attention_mask,
+            checkpoint_activations=checkpoint_activations,
+            is_infer=is_infer,
+            sequence_output=sequence_output,
+            parallel_output=parallel_output)
+
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        return self.model.state_dict(
+            destination=destination, prefix=prefix, keep_vars=keep_vars)
+
+    def load_state_dict(self, state_dict, strict=True):
+        return self.model.load_state_dict(state_dict, strict=strict)
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index d4f9c6bf..5369220f 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -1,7 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import os
 import os.path as osp
 from abc import ABC, abstractmethod
+from functools import partial
+from multiprocessing import Pool
 from threading import Lock
 from typing import Any, Dict, Generator, List, Mapping, Union
 
@@ -15,8 +18,10 @@ from modelscope.utils.config import Config
 from modelscope.utils.constant import Frameworks, ModelFile
 from modelscope.utils.device import (create_device, device_placement,
                                      verify_device)
+from modelscope.utils.hub import read_config, snapshot_download
 from modelscope.utils.import_utils import is_tf_available, is_torch_available
 from modelscope.utils.logger import get_logger
+from modelscope.utils.torch_utils import _find_free_port, _is_free_port
 from .util import is_model, is_official_hub_path
 
 if is_torch_available():
@@ -302,3 +307,106 @@ class Pipeline(ABC):
                 output should have the standard output name.
         """
         raise NotImplementedError('postprocess')
+
+
+class DistributedPipeline(Pipeline):
+    """This pipeline is used to load multi gpu models.
+
+    What will this class do:
+    1. Read the global config from the configuration.json
+    2. Set the multiprocessing method to spawn
+    3. Open a multiprocessing pool of the world_size to instantiate model pieces.
+    4. Set the master port and ip
+    5. Call _instantiate_one to instantiate one model piece
+        This method should be implemented by the derived class.
+    6. After the forward method is called, do preprocess in main process
+        and call _forward_one to collect results, and do
+        post process in main process.
+
+    NOTE: _instantiate_one and _forward_one are class methods, any derived class should implement them and
+    store the model handler in the class field.
+    """
+
+    def __init__(self,
+                 model: str = None,
+                 preprocessor: Union[Preprocessor, List[Preprocessor]] = None,
+                 auto_collate=True,
+                 **kwargs):
+        self.preprocessor = preprocessor
+        self._model_prepare = False
+        self._model_prepare_lock = Lock()
+        self._auto_collate = auto_collate
+
+        if os.path.exists(model):
+            self.model_dir = model
+        else:
+            self.model_dir = snapshot_download(model)
+        self.cfg = read_config(self.model_dir)
+        self.world_size = self.cfg.model.world_size
+        self.model_pool = None
+        self.device_name = 'cpu'
+        self.device = create_device(self.device_name)
+        self.has_multiple_models = False
+        self.framework = self.cfg.framework
+        if torch.multiprocessing.get_start_method(allow_none=True) is None:
+            torch.multiprocessing.set_start_method('spawn')
+
+        ranks = list(range(self.world_size))
+        self.model_pool = Pool(self.world_size)
+        # Pop the rendezvous settings: they are passed explicitly below, and
+        # leaving them in kwargs would supply the same keyword twice.
+        master_ip = kwargs.pop('master_ip', '127.0.0.1')
+        master_port = str(kwargs.pop('master_port', '29500'))
+        if not _is_free_port(int(master_port)):
+            master_port = str(_find_free_port())
+        self.model_pool.map(
+            partial(
+                self.__class__._instantiate_one,
+                model_dir=self.model_dir,
+                master_ip=master_ip,
+                master_port=master_port,
+                **self.cfg.model,
+                **kwargs), ranks)
+
+    def __del__(self):
+        if hasattr(self, 'model_pool') and self.model_pool is not None:
+            self.model_pool.terminate()
+
+    def __getstate__(self):
+        self_dict = self.__dict__.copy()
+        del self_dict['model_pool']
+        del self_dict['preprocessor']
+        del self_dict['_model_prepare_lock']
+        return self_dict
+
+    @classmethod
+    def _instantiate_one(cls, rank, model_dir, **kwargs):
+        """Instantiate one model piece.
+
+        @param rank: The model rank.
+        @param model_dir: The model_dir in the node.
+        @param kwargs: Any extra args.
+        @return: None. The model handler should be kept in the class field.
+        """
+        pass
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        inputs = {
+            'inputs': inputs,
+            'forward_params': forward_params,
+        }
+        res = self.model_pool.map(self.__class__._forward_one,
+                                  [inputs] * self.world_size)
+        return res[0]
+
+    @classmethod
+    def _forward_one(cls, inputs):
+        """Forward the inputs to one model piece.
+
+        Use the model handler kept in the class field to forward.
+
+        @param inputs: The inputs after the preprocessing.
+        @return: The forward results.
+        """
+        pass
diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py
new file mode 100644
index 00000000..202e6213
--- /dev/null
+++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py
@@ -0,0 +1,107 @@
+from typing import Any, Dict
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.nlp.plug import DistributedPlug
+from modelscope.pipelines.base import DistributedPipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import TextGenerationPreprocessor
+from modelscope.utils.constant import Tasks
+
+
+@PIPELINES.register_module(
+    Tasks.text_generation, module_name=Pipelines.plug_generation)
+class DistributedPlugPipeline(DistributedPipeline):
+    """This class is used to instantiate the plug model.
+    """
+
+    model = None
+
+    def __init__(self,
+                 model,
+                 preprocessor=None,
+                 first_sequence='sentence',
+                 **kwargs):
+        """Create a plug pipeline instance.
+
+        @param model: The model_id of plug(damo/nlp_plug_text-generation_27B).
+        The default path to damo/nlp_plug_text-generation_27B can be obtained by function
+        get_cache_dir("damo/nlp_plug_text-generation_27B"), the model should be downloaded to
+        this path before calling this class by model_id.
+        The model can be downloaded from the link on
+        https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary.
+        After downloading, you should have a plug model structure like this:
+        /your/path/to/damo/nlp_plug_text-generation_27B
+            |_ config.json
+            |_ configuration.json
+            |_ ds_zero-offload_10B_config.json
+            |_ vocab.txt
+            |_ model <-- an empty directory
+
+        Model binaries shall be downloaded separately to populate the model directory, so that
+        the model directory would contain the following binaries:
+            |_ model
+                |_ mp_rank_00_model_states.pt
+                |_ mp_rank_01_model_states.pt
+                |_ mp_rank_02_model_states.pt
+                |_ mp_rank_03_model_states.pt
+                |_ mp_rank_04_model_states.pt
+                |_ mp_rank_05_model_states.pt
+                |_ mp_rank_06_model_states.pt
+                |_ mp_rank_07_model_states.pt
+        @param preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will
+            be used as default.
+        @param first_sequence: The first_sequence key name if the input format is a dict.
+        @param kwargs:
+            sequence_length: The input sequence_length.
+        """
+        if preprocessor is None:
+            preprocessor = TextGenerationPreprocessor(
+                model,
+                first_sequence=first_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
+        super().__init__(model, preprocessor=preprocessor, **kwargs)
+        assert hasattr(preprocessor, 'tokenizer')
+        self.cls_token_id = preprocessor.tokenizer.cls_token_id
+
+    @classmethod
+    def _forward_one(cls, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        with torch.no_grad():
+            return cls.model.generate(inputs['inputs'],
+                                      **inputs['forward_params'])
+
+    def _sanitize_parameters(self, **pipeline_parameters):
+        return {}, pipeline_parameters, {}
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        batch_size = inputs['input_ids'].shape[0]
+        dec_input_ids = torch.full([batch_size, 1],
+                                   self.cls_token_id,
+                                   dtype=torch.long)
+        inputs['dec_input_ids'] = dec_input_ids
+        res = super().forward(inputs, **forward_params)
+        return res
+
+    @classmethod
+    def _instantiate_one(cls, rank, model_dir, **kwargs):
+        cls.model = DistributedPlug(model_dir, rank, **kwargs)
+        cls.model.eval()
+
+    def postprocess(self, inputs: Dict[str, Any],
+                    **postprocess_params) -> Dict[str, str]:
+        """Convert the generated token ids into a single text string.
+
+        Args:
+            inputs (Dict[str, Any]): forward output with 'generate_context'
+
+        Returns:
+            Dict[str, str]: the prediction results
+        """
+        from modelscope.outputs import OutputKeys
+        generate_context = inputs['generate_context']
+        generate_context = ''.join(
+            self.preprocessor.tokenizer.convert_ids_to_tokens(
+                generate_context)).replace('[UNK]', '“').replace('##', '')
+        return {OutputKeys.TEXT: generate_context}
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 825611d6..cfb8c9e8 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -164,7 +164,8 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
         """
 
         model_type = get_model_type(model_dir)
-        if model_type in (Models.structbert, Models.gpt3, Models.palm):
+        if model_type in (Models.structbert, Models.gpt3, Models.palm,
+                          Models.plug):
             from modelscope.models.nlp.structbert import SbertTokenizer
             return SbertTokenizer.from_pretrained(model_dir, use_fast=False)
         elif model_type == Models.veco:
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index 614b728a..d011dd4a 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -39,7 +39,8 @@ from modelscope.utils.device import create_device, verify_device
 from modelscope.utils.file_utils import func_receive_dict_inputs
 from modelscope.utils.logger import get_logger
 from modelscope.utils.registry import build_from_cfg
-from modelscope.utils.torch_utils import get_dist_info, init_dist
+from modelscope.utils.torch_utils import (get_dist_info, init_dist,
+                                          set_random_seed)
 from .base import BaseTrainer
 from .builder import TRAINERS
 from .default_config import DEFAULT_CONFIG
@@ -922,6 +923,4 @@ def worker_init_fn(worker_id, num_workers, rank, seed):
     # The seed of each worker equals to
     # num_worker * rank + worker_id + user_seed
     worker_seed = num_workers * rank + worker_id + seed
-    np.random.seed(worker_seed)
-    random.seed(worker_seed)
-    torch.manual_seed(worker_seed)
+    set_random_seed(worker_seed)
diff --git a/modelscope/utils/nlp/distributed.py b/modelscope/utils/nlp/distributed.py
new file mode 100755
index 00000000..2b590a10
--- /dev/null
+++ b/modelscope/utils/nlp/distributed.py
@@ -0,0 +1,130 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import torch
+import torch.distributed as dist
+from megatron import mpu
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from torch.autograd import Variable
+from torch.nn.modules import Module
+
+from modelscope.utils.torch_utils import init_dist
+
+
+def initialize_distributed(rank, mpu, world_size, model_parallel_size,
+                           master_ip, master_port):
+    """Join the NCCL process group as `rank` and set up model parallelism.
+
+    The group is sized by `world_size` and rendezvouses over TCP at
+    `master_ip:master_port` (the port may be given as an int or a str).
+    """
+    # Pin this process to one visible GPU, wrapping around if ranks > GPUs.
+    torch.cuda.set_device(rank % torch.cuda.device_count())
+    torch.distributed.init_process_group(
+        backend='nccl', world_size=world_size, rank=rank,
+        init_method=f'tcp://{master_ip}:{master_port}')
+    mpu.initialize_model_parallel(model_parallel_size)
+
+
+def normal_init_method(mean, std):
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=mean, std=std)
+
+    return init_
+
+
+def scaled_init_method(mean, std, num_layers):
+    """Return an initializer drawing from N(mean, std/sqrt(2*num_layers))."""
+    std = std / math.sqrt(2.0 * num_layers)
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=mean, std=std)
+
+    return init_
+
+
+class DistributedDataParallel(Module):
+    """Wrap `module` and all-reduce its grads over the data-parallel group."""
+
+    def __init__(self, module):
+        super(DistributedDataParallel, self).__init__()
+        self.warn_on_half = dist.get_backend() == dist.Backend.GLOO
+        # Set by forward(); defined here so an early allreduce is a no-op.
+        self.needs_reduction = False
+        self.module = module
+        self.data_parallel_group = mpu.get_data_parallel_group()
+        src_rank = mpu.get_model_parallel_rank()
+        for p in self.module.parameters():
+            if torch.is_tensor(p):
+                dist.broadcast(p, src_rank, group=self.data_parallel_group)
+
+        def allreduce_params(reduce_after=True,
+                             no_scale=False,
+                             fp32_allreduce=False):
+            if (self.needs_reduction):
+                self.needs_reduction = False
+                buckets = {}
+                for name, param in self.module.named_parameters():
+                    if param.requires_grad and param.grad is not None:
+                        tp = (param.data.type())
+                        buckets.setdefault(tp, []).append(param)
+                if self.warn_on_half:
+                    # Bucket keys are type-name strings, not tensor classes.
+                    if 'torch.cuda.HalfTensor' in buckets:
+                        print(
+                            'WARNING: gloo dist backend for half parameters may be extremely slow.',
+                            'It is recommended to use the NCCL backend in this case.'
+                        )
+                        self.warn_on_half = False
+                for tp in buckets:
+                    bucket = buckets[tp]
+                    grads = [param.grad.data for param in bucket]
+                    coalesced = _flatten_dense_tensors(grads)
+                    if fp32_allreduce:
+                        coalesced = coalesced.float()
+                    if not no_scale and not reduce_after:
+                        coalesced /= dist.get_world_size(
+                            group=self.data_parallel_group)
+                    dist.all_reduce(coalesced, group=self.data_parallel_group)
+                    torch.cuda.synchronize()
+                    if not no_scale and reduce_after:
+                        coalesced /= dist.get_world_size(
+                            group=self.data_parallel_group)
+                    for buf, synced in zip(
+                            grads, _unflatten_dense_tensors(coalesced, grads)):
+                        buf.copy_(synced)
+
+        self.hook_handles = []
+        self.hooks = []
+        for param in list(self.module.parameters()):
+
+            def allreduce_hook(*unused):
+                Variable._execution_engine.queue_callback(allreduce_params)
+
+        self.allreduce_params = allreduce_params
+
+    def forward(self, *inputs, **kwargs):
+        self.needs_reduction = True
+        return self.module(*inputs, **kwargs)
+
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        return self.module.state_dict(
+            destination=destination, prefix=prefix, keep_vars=keep_vars)
+
+    def load_state_dict(self, state_dict, strict=True):
+        self.module.load_state_dict(state_dict, strict=strict)
diff --git a/modelscope/utils/nlp/load_checkpoint.py b/modelscope/utils/nlp/load_checkpoint.py
new file mode 100755
index 00000000..6534e18d
--- /dev/null
+++ b/modelscope/utils/nlp/load_checkpoint.py
@@ -0,0 +1,117 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import torch
+
+
+def load_checkpoint(model,
+                    load_dir,
+                    tag,
+                    load_module_strict=True,
+                    load_optimizer_states=True,
+                    load_lr_scheduler_states=True):
+    r"""Load a training checkpoint into `model`.
+
+    Arguments:
+        model: Required. Engine to restore; queried via zero_optimization() and _load_zero_checkpoint().
+        load_dir: Required. Directory to load the checkpoint from
+        tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step.
+        load_module_strict: Optional. Whether module and checkpoint state_dict keys must match exactly.
+        load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint.
+         Ex. ADAM's momentum and variance
+        load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint.
+    Return:
+        load_path: Path of the loaded checkpoint. None if loading the checkpoint failed
+        client_state: State dictionary used for loading required training states in the client code.
+    """
+
+    ckpt_path, client_state = _load_checkpoint(
+        model,
+        load_dir,
+        tag,
+        load_module_strict=load_module_strict,
+        load_optimizer_states=load_optimizer_states,
+        load_lr_scheduler_states=load_lr_scheduler_states)
+
+    wants_zero_states = load_optimizer_states and model.zero_optimization()
+    if wants_zero_states and ckpt_path is not None:
+        model._load_zero_checkpoint(
+            load_dir, tag, load_optimizer_states=load_optimizer_states)
+
+    return ckpt_path, client_state
+
+
+def _get_ckpt_name(mpu, checkpoints_path, tag):
+    """Return the per-rank model-states path under `checkpoints_path/tag`."""
+    # Without an mpu handle the rank defaults to 0.
+    rank = mpu.get_model_parallel_rank() if mpu is not None else 0
+    filename = f'mp_rank_{rank:02d}_model_states.pt'
+    return os.path.join(checkpoints_path, str(tag), filename)
+
+
+def pre_load(mpu, load_dir, tag=''):
+    """Read this rank's checkpoint file and return its 'module' entry."""
+    ckpt_file = _get_ckpt_name(mpu, load_dir, tag)
+    state = torch.load(ckpt_file, map_location=lambda storage, loc: storage)
+    return state['module']
+
+
+def _load_checkpoint(model,
+                     load_dir,
+                     tag,
+                     load_module_strict=True,
+                     load_optimizer_states=True,
+                     load_lr_scheduler_states=True):
+    """Restore `model` from its checkpoint file under `load_dir`/`tag`.
+
+    Returns `(None, None)` when the file does not exist, otherwise the
+    path that was loaded and the checkpoint entries not consumed here.
+    """
+    ckpt_path = model._get_ckpt_name(load_dir, tag)
+    if not os.path.exists(ckpt_path):
+        return None, None
+
+    state = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
+
+    model.load_module_state_dict(
+        state_dict=state['module'], strict=load_module_strict)
+    if not model.zero_optimization() and load_optimizer_states:
+        if model.fp16_enabled():
+            model.optimizer.load_state_dict(
+                state['optimizer'],
+                load_optimizer_states=load_optimizer_states)
+        else:
+            model.optimizer.load_state_dict(state['optimizer'])
+
+    if load_lr_scheduler_states and model.lr_scheduler is not None:
+        model.lr_scheduler.load_state_dict(state['lr_scheduler'])
+
+    # Bookkeeping values stored in the checkpoint next to the weights.
+    model.csr_tensor_module_names = state['csr_tensor_module_names']
+    model.global_steps = state['global_steps']
+    model.global_samples = state.get(
+        'global_samples', model.global_steps * model.train_batch_size())
+    model.skipped_steps = state['skipped_steps']
+    model.loaded_checkpoint_mp_world_size = state['mp_world_size']
+    model.loaded_checkpoint_dp_world_size = state['dp_world_size']
+
+    # Every key outside this set is handed back to the caller untouched.
+    engine_keys = ('module', 'optimizer', 'lr_scheduler',
+                   'csr_tensor_module_names', 'skipped_steps', 'global_steps',
+                   'dp_world_size', 'mp_world_size')
+    client_state = {k: v for k, v in state.items() if k not in engine_keys}
+    return ckpt_path, client_state
diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py
index 45e33c3e..eaa285a2 100644
--- a/modelscope/utils/torch_utils.py
+++ b/modelscope/utils/torch_utils.py
@@ -3,16 +3,16 @@
 import functools
 import os
 import pickle
+import random
 import socket
 import subprocess
 import tempfile
 from typing import Callable, List, Optional, Tuple
 
+import numpy as np
 import torch
 import torch.multiprocessing as mp
 from torch import distributed as dist
-from torch._utils import (_flatten_dense_tensors, _take_tensors,
-                          _unflatten_dense_tensors)
 
 
 def _find_free_port() -> str:
@@ -49,7 +49,6 @@ def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None:
 def _init_dist_pytorch(backend: str, **kwargs) -> None:
     # rank = int(os.environ['RANK'])
     local_rank = int(os.environ['LOCAL_RANK'])
-
     torch.cuda.set_device(local_rank)
     dist.init_process_group(backend=backend, **kwargs)
 
@@ -180,3 +179,19 @@ def broadcast(inputs, src):
     dist.broadcast(inputs_tensor, src)
 
     return pickle.loads(inputs_tensor.cpu().numpy().tobytes())
+
+
+def set_random_seed(seed):
+    """Seed the Python, NumPy and torch RNGs; seed must be non-negative."""
+    if seed is None or seed < 0:
+        raise ValueError(
+            f'Random seed should be non-negative, current seed is {seed}')
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+
+def set_random_seed_mpu(seed):
+    from megatron import mpu  # function-scope import, resolved at call time
+    set_random_seed(seed)
+    mpu.model_parallel_cuda_manual_seed(seed)
diff --git a/requirements/nlp.txt b/requirements/nlp.txt
index ada4fc50..cf0468bb 100644
--- a/requirements/nlp.txt
+++ b/requirements/nlp.txt
@@ -1,6 +1,8 @@
+deepspeed
 en_core_web_sm>=2.3.5
 fairseq>=0.10.2
 jieba>=0.42.1
+megatron_util
 pai-easynlp
 # rough-score was just recently updated from 0.0.4 to 0.0.7
 # which introduced compatability issues that are being investigated
diff --git a/tests/pipelines/test_plug_text_generation.py b/tests/pipelines/test_plug_text_generation.py
new file mode 100644
index 00000000..90b48efa
--- /dev/null
+++ b/tests/pipelines/test_plug_text_generation.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+
+class TextPlugGenerationTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        # please make sure this local path exists.
+        self.model_id = 'damo/nlp_plug_text-generation_27B'
+        self.model_dir = snapshot_download(self.model_id)
+        self.plug_input = '段誉轻挥折扇，摇了摇头，说道：“你师父是你的师父，你师父可不是我的师父。"'
+
+    @unittest.skip('distributed plug, skipped')
+    def test_plug(self):
+        """ The model can be downloaded from the link on
+        https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary.
+        After downloading, you should have a plug model structure like this:
+        nlp_plug_text-generation_27B
+            |_ config.json
+            |_ configuration.json
+            |_ ds_zero-offload_10B_config.json
+            |_ vocab.txt
+            |_ model <-- an empty directory
+
+        Model binaries shall be downloaded separately to populate the model directory, so that
+        the model directory would contain the following binaries:
+            |_ model
+                |_ mp_rank_00_model_states.pt
+                |_ mp_rank_01_model_states.pt
+                |_ mp_rank_02_model_states.pt
+                |_ mp_rank_03_model_states.pt
+                |_ mp_rank_04_model_states.pt
+                |_ mp_rank_05_model_states.pt
+                |_ mp_rank_06_model_states.pt
+                |_ mp_rank_07_model_states.pt
+        """
+        # download model binaries to <model_dir>/model
+        pipe = pipeline(Tasks.text_generation, model=self.model_id)
+        print(
+            f'input: {self.plug_input}\noutput: {pipe(self.plug_input, out_length=256)}'
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()

From 9cbf246a8c4a09be20d5a32cea728f2250faa305 Mon Sep 17 00:00:00 2001
From: ly261666 <ly261666@alibaba-inc.com>
Date: Tue, 6 Sep 2022 10:02:49 +0800
Subject: [PATCH 065/175] =?UTF-8?q?[to=20#42322933]=20=E6=96=B0=E5=A2=9EUL?=
 =?UTF-8?q?FD=E4=BA=BA=E8=84=B8=E6=A3=80=E6=B5=8B=E5=99=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 完成Maas-cv CR标准 自查
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9957634
---
 data/test/images/ulfd_face_detection.jpg      |   3 +
 modelscope/metainfo.py                        |   2 +
 .../models/cv/face_detection/__init__.py      |   5 +-
 .../cv/face_detection/ulfd_slim/__init__.py   |   1 +
 .../cv/face_detection/ulfd_slim/detection.py  |  44 ++++++
 .../ulfd_slim/vision/__init__.py              |   0
 .../ulfd_slim/vision/box_utils.py             | 124 +++++++++++++++++
 .../ulfd_slim/vision/mb_tiny.py               |  49 +++++++
 .../ulfd_slim/vision/ssd/__init__.py          |   0
 .../vision/ssd/data_preprocessing.py          |  18 +++
 .../ulfd_slim/vision/ssd/fd_config.py         |  49 +++++++
 .../ulfd_slim/vision/ssd/mb_tiny_fd.py        | 124 +++++++++++++++++
 .../ulfd_slim/vision/ssd/predictor.py         |  80 +++++++++++
 .../ulfd_slim/vision/ssd/ssd.py               | 129 ++++++++++++++++++
 .../ulfd_slim/vision/transforms.py            |  56 ++++++++
 modelscope/pipelines/cv/__init__.py           |   4 +-
 .../cv/ulfd_face_detection_pipeline.py        |  56 ++++++++
 modelscope/utils/cv/image_utils.py            |  21 +++
 tests/pipelines/test_ulfd_face_detection.py   |  36 +++++
 19 files changed, 798 insertions(+), 3 deletions(-)
 create mode 100644 data/test/images/ulfd_face_detection.jpg
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/__init__.py
 create mode 100755 modelscope/models/cv/face_detection/ulfd_slim/detection.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/__init__.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/__init__.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py
 create mode 100644 modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py
 create mode 100644 modelscope/pipelines/cv/ulfd_face_detection_pipeline.py
 create mode 100644 tests/pipelines/test_ulfd_face_detection.py

diff --git a/data/test/images/ulfd_face_detection.jpg b/data/test/images/ulfd_face_detection.jpg
new file mode 100644
index 00000000..c95881fe
--- /dev/null
+++ b/data/test/images/ulfd_face_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
+size 87228
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 792bd708..22c2d99e 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -35,6 +35,7 @@ class Models(object):
     fer = 'fer'
     retinaface = 'retinaface'
     shop_segmentation = 'shop-segmentation'
+    ulfd = 'ulfd'
 
     # EasyCV models
     yolox = 'YOLOX'
@@ -122,6 +123,7 @@ class Pipelines(object):
     salient_detection = 'u2net-salient-detection'
     image_classification = 'image-classification'
     face_detection = 'resnet-face-detection-scrfd10gkps'
+    ulfd_face_detection = 'manual-face-detection-ulfd'
     facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
     retina_face_detection = 'resnet50-face-detection-retinaface'
     live_category = 'live-category'
diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py
index a3c47164..63ff1b83 100644
--- a/modelscope/models/cv/face_detection/__init__.py
+++ b/modelscope/models/cv/face_detection/__init__.py
@@ -5,10 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .retinaface import RetinaFaceDetection
-
+    from .ulfd_slim import UlfdFaceDetector
 else:
     _import_structure = {
-        'retinaface': ['RetinaFaceDetection'],
+        'ulfd_slim': ['UlfdFaceDetector'],
+        'retinaface': ['RetinaFaceDetection']
     }
 
     import sys
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/__init__.py
new file mode 100644
index 00000000..41a2226a
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/__init__.py
@@ -0,0 +1 @@
+from .detection import UlfdFaceDetector
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/detection.py b/modelscope/models/cv/face_detection/ulfd_slim/detection.py
new file mode 100755
index 00000000..c0e2da6e
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/detection.py
@@ -0,0 +1,44 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import os
+
+import cv2
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from .vision.ssd.fd_config import define_img_size
+from .vision.ssd.mb_tiny_fd import (create_mb_tiny_fd,
+                                    create_mb_tiny_fd_predictor)
+
+define_img_size(640)
+
+
+@MODELS.register_module(Tasks.face_detection, module_name=Models.ulfd)
+class UlfdFaceDetector(TorchModel):
+
+    def __init__(self, model_path, device='cuda'):
+        super().__init__(model_path)
+        torch.set_grad_enabled(False)
+        cudnn.benchmark = True
+        self.model_path = model_path
+        self.device = device
+        self.net = create_mb_tiny_fd(2, is_test=True, device=device)
+        self.predictor = create_mb_tiny_fd_predictor(
+            self.net, candidate_size=1500, device=device)
+        self.net.load(model_path)
+        self.net = self.net.to(device)
+
+    def forward(self, input):
+        img_raw = input['img']
+        img = np.array(img_raw.cpu().detach())
+        img = img[:, :, ::-1]
+        prob_th = 0.85
+        keep_top_k = 750
+        boxes, labels, probs = self.predictor.predict(img, keep_top_k, prob_th)
+        return boxes, probs
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py
new file mode 100644
index 00000000..46d3b890
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py
@@ -0,0 +1,124 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import math
+
+import torch
+
+
+def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
+    """Greedy non-maximum suppression over scored boxes.
+
+    Args:
+        box_scores (N, 5): boxes in corner-form and probabilities.
+        iou_threshold: intersection over union threshold.
+        top_k: keep top_k results. If k <= 0, keep all the results.
+        candidate_size: only consider the candidates with the highest scores.
+    Returns:
+         The rows of box_scores that were kept, highest score first.
+    """
+    scores = box_scores[:, -1]
+    boxes = box_scores[:, :-1]
+    picked = []
+    _, indexes = scores.sort(descending=True)
+    indexes = indexes[:candidate_size]
+    while len(indexes) > 0:
+        current = indexes[0]
+        picked.append(current.item())
+        if 0 < top_k == len(picked) or len(indexes) == 1:
+            break
+        current_box = boxes[current, :]
+        indexes = indexes[1:]
+        rest_boxes = boxes[indexes, :]
+        iou = iou_of(
+            rest_boxes,
+            current_box.unsqueeze(0),
+        )
+        indexes = indexes[iou <= iou_threshold]
+
+    return box_scores[picked, :]
+
+
+def nms(box_scores, nms_method=None, score_threshold=None,
+        iou_threshold=None, sigma=0.5, top_k=-1, candidate_size=200):
+    """Run hard NMS on box_scores and return the surviving rows.
+
+    nms_method, score_threshold and sigma are accepted but not used here.
+    """
+    kept = hard_nms(
+        box_scores, iou_threshold, top_k, candidate_size=candidate_size)
+    return kept
+
+
+def generate_priors(feature_map_list,
+                    shrinkage_list,
+                    image_size,
+                    min_boxes,
+                    clamp=True) -> torch.Tensor:
+    priors = []
+    for index in range(0, len(feature_map_list[0])):
+        scale_w = image_size[0] / shrinkage_list[0][index]
+        scale_h = image_size[1] / shrinkage_list[1][index]
+        for j in range(0, feature_map_list[1][index]):
+            for i in range(0, feature_map_list[0][index]):
+                x_center = (i + 0.5) / scale_w
+                y_center = (j + 0.5) / scale_h
+
+                for min_box in min_boxes[index]:
+                    w = min_box / image_size[0]
+                    h = min_box / image_size[1]
+                    priors.append([x_center, y_center, w, h])
+    priors = torch.tensor(priors)
+    if clamp:
+        torch.clamp(priors, 0.0, 1.0, out=priors)
+    return priors
+
+
+def convert_locations_to_boxes(locations, priors, center_variance,
+                               size_variance):
+    # priors can have one dimension less.
+    if priors.dim() + 1 == locations.dim():
+        priors = priors.unsqueeze(0)
+    a = locations[..., :2] * center_variance * priors[...,
+                                                      2:] + priors[..., :2]
+    b = torch.exp(locations[..., 2:] * size_variance) * priors[..., 2:]
+
+    return torch.cat([a, b], dim=locations.dim() - 1)
+
+
+def center_form_to_corner_form(locations):
+    a = locations[..., :2] - locations[..., 2:] / 2
+    b = locations[..., :2] + locations[..., 2:] / 2
+    return torch.cat([a, b], locations.dim() - 1)
+
+
+def iou_of(boxes0, boxes1, eps=1e-5):
+    """Return intersection-over-union (Jaccard index) of boxes.
+
+    Args:
+        boxes0 (N, 4): ground truth boxes.
+        boxes1 (N or 1, 4): predicted boxes.
+        eps: a small number to avoid 0 as denominator.
+    Returns:
+        iou (N): IoU values.
+    """
+    overlap_left_top = torch.max(boxes0[..., :2], boxes1[..., :2])
+    overlap_right_bottom = torch.min(boxes0[..., 2:], boxes1[..., 2:])
+
+    overlap_area = area_of(overlap_left_top, overlap_right_bottom)
+    area0 = area_of(boxes0[..., :2], boxes0[..., 2:])
+    area1 = area_of(boxes1[..., :2], boxes1[..., 2:])
+    return overlap_area / (area0 + area1 - overlap_area + eps)
+
+
+def area_of(left_top, right_bottom) -> torch.Tensor:
+    """Compute the areas of rectangles given two corners.
+
+    Args:
+        left_top (N, 2): left top corner.
+        right_bottom (N, 2): right bottom corner.
+
+    Returns:
+        area (N): return the area.
+    """
+    hw = torch.clamp(right_bottom - left_top, min=0.0)
+    return hw[..., 0] * hw[..., 1]
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py
new file mode 100644
index 00000000..8bbcef41
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py
@@ -0,0 +1,49 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Mb_Tiny(nn.Module):
+
+    def __init__(self, num_classes=2):
+        super(Mb_Tiny, self).__init__()
+        self.base_channel = 8 * 2
+
+        def conv_bn(inp, oup, stride):
+            return nn.Sequential(
+                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
+                nn.BatchNorm2d(oup), nn.ReLU(inplace=True))
+
+        def conv_dw(inp, oup, stride):
+            return nn.Sequential(
+                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
+                nn.BatchNorm2d(inp),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(oup),
+                nn.ReLU(inplace=True),
+            )
+
+        self.model = nn.Sequential(
+            conv_bn(3, self.base_channel, 2),  # 160*120
+            conv_dw(self.base_channel, self.base_channel * 2, 1),
+            conv_dw(self.base_channel * 2, self.base_channel * 2, 2),  # 80*60
+            conv_dw(self.base_channel * 2, self.base_channel * 2, 1),
+            conv_dw(self.base_channel * 2, self.base_channel * 4, 2),  # 40*30
+            conv_dw(self.base_channel * 4, self.base_channel * 4, 1),
+            conv_dw(self.base_channel * 4, self.base_channel * 4, 1),
+            conv_dw(self.base_channel * 4, self.base_channel * 4, 1),
+            conv_dw(self.base_channel * 4, self.base_channel * 8, 2),  # 20*15
+            conv_dw(self.base_channel * 8, self.base_channel * 8, 1),
+            conv_dw(self.base_channel * 8, self.base_channel * 8, 1),
+            conv_dw(self.base_channel * 8, self.base_channel * 16, 2),  # 10*8
+            conv_dw(self.base_channel * 16, self.base_channel * 16, 1))
+        self.fc = nn.Linear(1024, num_classes)
+
+    def forward(self, x):
+        x = self.model(x)
+        x = F.avg_pool2d(x, 7)
+        x = x.view(-1, 1024)
+        x = self.fc(x)
+        return x
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py
new file mode 100644
index 00000000..9251d67f
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py
@@ -0,0 +1,18 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+from ..transforms import Compose, Resize, SubtractMeans, ToTensor
+
+
+class PredictionTransform:
+
+    def __init__(self, size, mean=0.0, std=1.0):
+        self.transform = Compose([
+            Resize(size),
+            SubtractMeans(mean), lambda img, boxes=None, labels=None:
+            (img / std, boxes, labels),
+            ToTensor()
+        ])
+
+    def __call__(self, image):
+        image, _, _ = self.transform(image)
+        return image
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py
new file mode 100644
index 00000000..495a2fcd
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py
@@ -0,0 +1,49 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import numpy as np
+
+from ..box_utils import generate_priors
+
+image_mean_test = image_mean = np.array([127, 127, 127])
+image_std = 128.0
+iou_threshold = 0.3
+center_variance = 0.1
+size_variance = 0.2
+
+min_boxes = [[10, 16, 24], [32, 48], [64, 96], [128, 192, 256]]
+shrinkage_list = []
+image_size = [320, 240]  # default input size 320*240
+feature_map_w_h_list = [[40, 20, 10, 5], [30, 15, 8,
+                                          4]]  # default feature map size
+priors = []
+
+
+def define_img_size(size):
+    """Set the module-level image size, feature maps and priors for `size`.
+
+    `size` must be a supported width: 128, 160, 320, 480, 640 or 1280.
+    """
+    global image_size, feature_map_w_h_list, priors
+    img_size_dict = {
+        128: [128, 96],
+        160: [160, 120],
+        320: [320, 240],
+        480: [480, 360],
+        640: [640, 480],
+        1280: [1280, 960]
+    }
+    image_size = img_size_dict[size]
+    feature_map_w_h_list_dict = {
+        128: [[16, 8, 4, 2], [12, 6, 3, 2]],
+        160: [[20, 10, 5, 3], [15, 8, 4, 2]],
+        320: [[40, 20, 10, 5], [30, 15, 8, 4]],
+        480: [[60, 30, 15, 8], [45, 23, 12, 6]],
+        640: [[80, 40, 20, 10], [60, 30, 15, 8]],
+        1280: [[160, 80, 40, 20], [120, 60, 30, 15]]
+    }
+    feature_map_w_h_list = feature_map_w_h_list_dict[size]
+    # Replace, don't append: a repeated call must not keep stale strides.
+    shrinkage_list[:] = [[dim / cells for cells in row]
+                         for dim, row in zip(image_size, feature_map_w_h_list)]
+    priors = generate_priors(feature_map_w_h_list, shrinkage_list, image_size,
+                             min_boxes)
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py
new file mode 100644
index 00000000..91ed268d
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py
@@ -0,0 +1,124 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+from torch.nn import Conv2d, ModuleList, ReLU, Sequential
+
+from ..mb_tiny import Mb_Tiny
+from . import fd_config as config
+from .predictor import Predictor
+from .ssd import SSD
+
+
+def SeperableConv2d(in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0):
+    """Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d.
+    """
+    return Sequential(
+        Conv2d(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=kernel_size,
+            groups=in_channels,
+            stride=stride,
+            padding=padding),
+        ReLU(),
+        Conv2d(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=1),
+    )
+
+
+def create_mb_tiny_fd(num_classes, is_test=False, device='cuda'):
+    base_net = Mb_Tiny(2)
+    base_net_model = base_net.model  # disable dropout layer
+
+    source_layer_indexes = [8, 11, 13]
+    extras = ModuleList([
+        Sequential(
+            Conv2d(
+                in_channels=base_net.base_channel * 16,
+                out_channels=base_net.base_channel * 4,
+                kernel_size=1), ReLU(),
+            SeperableConv2d(
+                in_channels=base_net.base_channel * 4,
+                out_channels=base_net.base_channel * 16,
+                kernel_size=3,
+                stride=2,
+                padding=1), ReLU())
+    ])
+
+    regression_headers = ModuleList([
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 4,
+            out_channels=3 * 4,
+            kernel_size=3,
+            padding=1),
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 8,
+            out_channels=2 * 4,
+            kernel_size=3,
+            padding=1),
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 16,
+            out_channels=2 * 4,
+            kernel_size=3,
+            padding=1),
+        Conv2d(
+            in_channels=base_net.base_channel * 16,
+            out_channels=3 * 4,
+            kernel_size=3,
+            padding=1)
+    ])
+
+    classification_headers = ModuleList([
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 4,
+            out_channels=3 * num_classes,
+            kernel_size=3,
+            padding=1),
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 8,
+            out_channels=2 * num_classes,
+            kernel_size=3,
+            padding=1),
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 16,
+            out_channels=2 * num_classes,
+            kernel_size=3,
+            padding=1),
+        Conv2d(
+            in_channels=base_net.base_channel * 16,
+            out_channels=3 * num_classes,
+            kernel_size=3,
+            padding=1)
+    ])
+
+    return SSD(
+        num_classes,
+        base_net_model,
+        source_layer_indexes,
+        extras,
+        classification_headers,
+        regression_headers,
+        is_test=is_test,
+        config=config,
+        device=device)
+
+
+def create_mb_tiny_fd_predictor(net,
+                                candidate_size=200,
+                                nms_method=None,
+                                sigma=0.5,
+                                device=None):
+    predictor = Predictor(
+        net,
+        config.image_size,
+        config.image_mean_test,
+        config.image_std,
+        nms_method=nms_method,
+        iou_threshold=config.iou_threshold,
+        candidate_size=candidate_size,
+        sigma=sigma,
+        device=device)
+    return predictor
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py
new file mode 100644
index 00000000..f71820a5
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py
@@ -0,0 +1,80 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import torch
+
+from .. import box_utils
+from .data_preprocessing import PredictionTransform
+
+
+class Predictor:
+
+    def __init__(self,
+                 net,
+                 size,
+                 mean=0.0,
+                 std=1.0,
+                 nms_method=None,
+                 iou_threshold=0.3,
+                 filter_threshold=0.85,
+                 candidate_size=200,
+                 sigma=0.5,
+                 device=None):
+        self.net = net
+        self.transform = PredictionTransform(size, mean, std)
+        self.iou_threshold = iou_threshold
+        self.filter_threshold = filter_threshold
+        self.candidate_size = candidate_size
+        self.nms_method = nms_method
+
+        self.sigma = sigma
+        if device:
+            self.device = device
+        else:
+            self.device = torch.device(
+                'cuda:0' if torch.cuda.is_available() else 'cpu')
+
+        self.net.to(self.device)
+        self.net.eval()
+
+    def predict(self, image, top_k=-1, prob_threshold=None):
+        """Detect objects in one HWC image; returns (boxes, labels, probs).
+
+        Boxes are scaled by the image width/height. A prob_threshold of None
+        falls back to self.filter_threshold; an explicit 0 is honoured.
+        """
+        height, width, _ = image.shape
+        images = self.transform(image).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            scores, boxes = self.net.forward(images)
+        scores, boxes = scores[0], boxes[0]
+        if prob_threshold is None:
+            prob_threshold = self.filter_threshold
+        picked_box_probs = []
+        picked_labels = []
+        for class_index in range(1, scores.size(1)):
+            probs = scores[:, class_index]
+            mask = probs > prob_threshold
+            probs = probs[mask]
+            if probs.size(0) == 0:
+                continue
+            subset_boxes = boxes[mask, :]
+            box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1)
+            box_probs = box_utils.nms(
+                box_probs,
+                self.nms_method,
+                score_threshold=prob_threshold,
+                iou_threshold=self.iou_threshold,
+                sigma=self.sigma,
+                top_k=top_k,
+                candidate_size=self.candidate_size)
+            picked_box_probs.append(box_probs)
+            picked_labels.extend([class_index] * box_probs.size(0))
+        if not picked_box_probs:
+            return torch.tensor([]), torch.tensor([]), torch.tensor([])
+        picked_box_probs = torch.cat(picked_box_probs)
+        picked_box_probs[:, 0] *= width
+        picked_box_probs[:, 1] *= height
+        picked_box_probs[:, 2] *= width
+        picked_box_probs[:, 3] *= height
+        return picked_box_probs[:, :4], torch.tensor(
+            picked_labels), picked_box_probs[:, 4]
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py
new file mode 100644
index 00000000..08ff93a4
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py
@@ -0,0 +1,129 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+from collections import namedtuple
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .. import box_utils
+
+GraphPath = namedtuple('GraphPath', ['s0', 'name', 's1'])
+
+
+class SSD(nn.Module):
+
+    def __init__(self,
+                 num_classes: int,
+                 base_net: nn.ModuleList,
+                 source_layer_indexes: List[int],
+                 extras: nn.ModuleList,
+                 classification_headers: nn.ModuleList,
+                 regression_headers: nn.ModuleList,
+                 is_test=False,
+                 config=None,
+                 device=None):
+        """Compose a SSD model using the given components.
+        """
+        super(SSD, self).__init__()
+
+        self.num_classes = num_classes
+        self.base_net = base_net
+        self.source_layer_indexes = source_layer_indexes
+        self.extras = extras
+        self.classification_headers = classification_headers
+        self.regression_headers = regression_headers
+        self.is_test = is_test
+        self.config = config
+
+        # register layers in source_layer_indexes by adding them to a module list
+        self.source_layer_add_ons = nn.ModuleList([
+            t[1] for t in source_layer_indexes
+            if isinstance(t, tuple) and not isinstance(t, GraphPath)
+        ])
+        if device:
+            self.device = device
+        else:
+            self.device = torch.device(
+                'cuda:0' if torch.cuda.is_available() else 'cpu')
+        if is_test:
+            self.config = config
+            self.priors = config.priors.to(self.device)
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        confidences = []
+        locations = []
+        start_layer_index = 0
+        header_index = 0
+        end_layer_index = 0
+        for end_layer_index in self.source_layer_indexes:
+            if isinstance(end_layer_index, GraphPath):
+                path = end_layer_index
+                end_layer_index = end_layer_index.s0
+                added_layer = None
+            elif isinstance(end_layer_index, tuple):
+                added_layer = end_layer_index[1]
+                end_layer_index = end_layer_index[0]
+                path = None
+            else:
+                added_layer = None
+                path = None
+            for layer in self.base_net[start_layer_index:end_layer_index]:
+                x = layer(x)
+            if added_layer:
+                y = added_layer(x)
+            else:
+                y = x
+            if path:
+                sub = getattr(self.base_net[end_layer_index], path.name)
+                for layer in sub[:path.s1]:
+                    x = layer(x)
+                y = x
+                for layer in sub[path.s1:]:
+                    x = layer(x)
+                end_layer_index += 1
+            start_layer_index = end_layer_index
+            confidence, location = self.compute_header(header_index, y)
+            header_index += 1
+            confidences.append(confidence)
+            locations.append(location)
+
+        for layer in self.base_net[end_layer_index:]:
+            x = layer(x)
+
+        for layer in self.extras:
+            x = layer(x)
+            confidence, location = self.compute_header(header_index, x)
+            header_index += 1
+            confidences.append(confidence)
+            locations.append(location)
+
+        confidences = torch.cat(confidences, 1)
+        locations = torch.cat(locations, 1)
+
+        if self.is_test:
+            confidences = F.softmax(confidences, dim=2)
+            boxes = box_utils.convert_locations_to_boxes(
+                locations, self.priors, self.config.center_variance,
+                self.config.size_variance)
+            boxes = box_utils.center_form_to_corner_form(boxes)
+            return confidences, boxes
+        else:
+            return confidences, locations
+
+    def compute_header(self, i, x):
+        confidence = self.classification_headers[i](x)
+        confidence = confidence.permute(0, 2, 3, 1).contiguous()
+        confidence = confidence.view(confidence.size(0), -1, self.num_classes)
+
+        location = self.regression_headers[i](x)
+        location = location.permute(0, 2, 3, 1).contiguous()
+        location = location.view(location.size(0), -1, 4)
+
+        return confidence, location
+
+    def load(self, model):
+        self.load_state_dict(
+            torch.load(model, map_location=lambda storage, loc: storage))
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py
new file mode 100644
index 00000000..7c5331f1
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py
@@ -0,0 +1,56 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import types
+
+import cv2
+import numpy as np
+import torch
+from numpy import random
+
+
+class Compose(object):
+    """Composes several augmentations together.
+    Args:
+        transforms (List[Transform]): list of transforms to compose.
+    Example:
+        >>> augmentations.Compose([
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.ToTensor(),
+        >>> ])
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, img, boxes=None, labels=None):
+        for t in self.transforms:
+            img, boxes, labels = t(img, boxes, labels)
+        return img, boxes, labels
+
+
+class SubtractMeans(object):
+
+    def __init__(self, mean):
+        self.mean = np.array(mean, dtype=np.float32)
+
+    def __call__(self, image, boxes=None, labels=None):
+        image = image.astype(np.float32)
+        image -= self.mean
+        return image.astype(np.float32), boxes, labels
+
+
+class Resize(object):
+
+    def __init__(self, size=(300, 300)):
+        self.size = size
+
+    def __call__(self, image, boxes=None, labels=None):
+        image = cv2.resize(image, (self.size[0], self.size[1]))
+        return image, boxes, labels
+
+
+class ToTensor(object):
+
+    def __call__(self, cvimage, boxes=None, labels=None):
+        return torch.from_numpy(cvimage.astype(np.float32)).permute(
+            2, 0, 1), boxes, labels
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 72a225ff..02682fa0 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -46,8 +46,9 @@ if TYPE_CHECKING:
     from .virtual_try_on_pipeline import VirtualTryonPipeline
     from .shop_segmentation_pipleline import ShopSegmentationPipeline
     from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline
-    from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline
+    from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline
     from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
+    from .ulfd_face_detection_pipeline import UlfdFaceDetectionPipeline
     from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline
     from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline
 
@@ -110,6 +111,7 @@ else:
         ['TextDrivenSegmentationPipeline'],
         'movie_scene_segmentation_pipeline':
         ['MovieSceneSegmentationPipeline'],
+        'ulfd_face_detection_pipeline': ['UlfdFaceDetectionPipeline'],
         'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'],
         'facial_expression_recognition_pipelin':
         ['FacialExpressionRecognitionPipeline']
diff --git a/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py
new file mode 100644
index 00000000..1263082b
--- /dev/null
+++ b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py
@@ -0,0 +1,56 @@
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_detection import UlfdFaceDetector
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_detection, module_name=Pipelines.ulfd_face_detection)
+class UlfdFaceDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a face detection pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {ckpt_path}')
+        detector = UlfdFaceDetector(model_path=ckpt_path, device=self.device)
+        self.detector = detector
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        img = img.astype(np.float32)
+        result = {'img': img}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result = self.detector(input)
+        assert result is not None
+        bboxes = result[0].tolist()
+        scores = result[1].tolist()
+        return {
+            OutputKeys.SCORES: scores,
+            OutputKeys.BOXES: bboxes,
+            OutputKeys.KEYPOINTS: None,
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index cb07ba1a..6175a53f 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -89,6 +89,27 @@ def draw_keypoints(output, original_image):
     return image
 
 
+def draw_face_detection_no_lm_result(img_path, detection_result):
+    bboxes = np.array(detection_result[OutputKeys.BOXES])
+    scores = np.array(detection_result[OutputKeys.SCORES])
+    img = cv2.imread(img_path)
+    assert img is not None, f"Can't read img: {img_path}"
+    for i in range(len(scores)):
+        bbox = bboxes[i].astype(np.int32)
+        x1, y1, x2, y2 = bbox
+        score = scores[i]
+        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
+        cv2.putText(
+            img,
+            f'{score:.2f}', (x1, y2),
+            1,
+            1.0, (0, 255, 0),
+            thickness=1,
+            lineType=8)
+    print(f'Found {len(scores)} faces')
+    return img
+
+
 def draw_facial_expression_result(img_path, facial_expression_result):
     label_idx = facial_expression_result[OutputKeys.LABELS]
     map_list = [
diff --git a/tests/pipelines/test_ulfd_face_detection.py b/tests/pipelines/test_ulfd_face_detection.py
new file mode 100644
index 00000000..0ffa688c
--- /dev/null
+++ b/tests/pipelines/test_ulfd_face_detection.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+import numpy as np
+
+from modelscope.msdatasets import MsDataset
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result
+from modelscope.utils.test_utils import test_level
+
+
+class UlfdFaceDetectionTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_manual_face-detection_ulfd'
+
+    def show_result(self, img_path, detection_result):
+        img = draw_face_detection_no_lm_result(img_path, detection_result)
+        cv2.imwrite('result.png', img)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        face_detection = pipeline(Tasks.face_detection, model=self.model_id)
+        img_path = 'data/test/images/ulfd_face_detection.jpg'
+
+        result = face_detection(img_path)
+        self.show_result(img_path, result)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 4c5afd22d401f6fa1174857c65e8bcc18998e0df Mon Sep 17 00:00:00 2001
From: "suluyan.sly" <suluyan.sly@alibaba-inc.com>
Date: Tue, 6 Sep 2022 14:55:56 +0800
Subject: [PATCH 066/175] [to #42322933]fix: rm plug_for_text_generation in
 nlp/__init__.py         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10029585

---
 modelscope/models/nlp/plug/__init__.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/modelscope/models/nlp/plug/__init__.py b/modelscope/models/nlp/plug/__init__.py
index b74258a4..dbc20751 100644
--- a/modelscope/models/nlp/plug/__init__.py
+++ b/modelscope/models/nlp/plug/__init__.py
@@ -7,13 +7,11 @@ if TYPE_CHECKING:
     from .configuration_plug import PlugNLGConfig
     from .modeling_plug import PlugModel
     from .distributed_plug import DistributedPlug
-    from .plug_for_text_generation import PlugForTextGeneration
 else:
     _import_structure = {
         'configuration_plug': ['PlugNLGConfig'],
         'modeling_plug': ['PlugModel'],
         'distributed_plug': ['DistributedPlug'],
-        'plug_for_text_generation': ['PlugForTextGeneration'],
     }
 
     import sys

From 01e768503c0ecb9e4cc09e3ac01ced1c5f35dd8b Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Tue, 6 Sep 2022 15:13:14 +0800
Subject: [PATCH 067/175] [to #42322933] Fix random seed for trainer

1. Fix random seed for trainer and init it at the first line of init
2. Add a regress test for fixed training
3. Change the dataset 'dureader_robust_qg' to 'DuReader_robust-QG'
4. Change some datasets from loading hf.datasets to loading msdataset.load
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10029509
---
 modelscope/trainers/trainer.py                |  7 +-
 modelscope/utils/regress_test_utils.py        | 14 ++++
 modelscope/utils/torch_utils.py               |  1 +
 .../data/test/regression/sbert-base-tnews.bin |  3 +
 .../test_finetune_sequence_classification.py  | 84 ++++++++++++++++---
 .../trainers/test_finetune_text_generation.py |  2 +-
 6 files changed, 95 insertions(+), 16 deletions(-)
 create mode 100644 tests/trainers/data/test/regression/sbert-base-tnews.bin

diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index d011dd4a..fa6f8a99 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -75,6 +75,7 @@ class EpochBasedTrainer(BaseTrainer):
             this preprocessing action will be executed every time the dataset's __getitem__ is called.
         optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple
             containing the optimizer and the scheduler to use.
+        seed (int): The optional random seed for torch, cuda, numpy and random.
         max_epochs: (int, optional): Total training epochs.
     """
 
@@ -93,8 +94,11 @@ class EpochBasedTrainer(BaseTrainer):
                               torch.optim.lr_scheduler._LRScheduler] = (None,
                                                                         None),
             model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
+            seed: int = 42,
             **kwargs):
 
+        self._seed = seed
+        set_random_seed(self._seed)
         if isinstance(model, str):
             if os.path.exists(model):
                 self.model_dir = model if os.path.isdir(
@@ -213,9 +217,6 @@ class EpochBasedTrainer(BaseTrainer):
 
         self.use_fp16 = kwargs.get('use_fp16', False)
 
-        # TODO @wenmeng.zwm add seed init fn
-        self._seed = 0
-
         if kwargs.get('launcher', None) is not None:
             init_dist(kwargs['launcher'])
 
diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py
index ca50d579..82267447 100644
--- a/modelscope/utils/regress_test_utils.py
+++ b/modelscope/utils/regress_test_utils.py
@@ -133,6 +133,7 @@ class RegressTool:
                              compare_fn=None,
                              ignore_keys=None,
                              compare_random=True,
+                             reset_dropout=True,
                              lazy_stop_callback=None):
         """Monitor a pytorch module's backward data and cfg data within a step of the optimizer.
 
@@ -151,6 +152,7 @@ class RegressTool:
         @param compare_fn: A custom fn used to compare the results manually.
         @param ignore_keys: The keys to ignore of the named_parameters.
         @param compare_random: If to compare random setttings, default True.
+        @param reset_dropout: Reset all dropout modules to 0.0.
         @param lazy_stop_callback: A callback passed in, when the moniting is over, this callback will be called.
 
         >>> def compare_fn(v1, v2, key, type):
@@ -202,6 +204,18 @@ class RegressTool:
             trainer,
             '_seed') else trainer.seed if hasattr(trainer, 'seed') else None
 
+        if reset_dropout:
+            with torch.no_grad():
+
+                def reinit_dropout(_module):
+                    for name, submodule in _module.named_children():
+                        if isinstance(submodule, torch.nn.Dropout):
+                            setattr(_module, name, torch.nn.Dropout(0.))
+                        else:
+                            reinit_dropout(submodule)
+
+                reinit_dropout(module)
+
         if level == 'strict':
             hack_forward(module, file_name, io_json)
             intercept_module(module, io_json)
diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py
index eaa285a2..6d4132f6 100644
--- a/modelscope/utils/torch_utils.py
+++ b/modelscope/utils/torch_utils.py
@@ -186,6 +186,7 @@ def set_random_seed(seed):
         random.seed(seed)
         np.random.seed(seed)
         torch.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
     else:
         raise ValueError(
             f'Random seed should be positive, current seed is {seed}')
diff --git a/tests/trainers/data/test/regression/sbert-base-tnews.bin b/tests/trainers/data/test/regression/sbert-base-tnews.bin
new file mode 100644
index 00000000..3a06d49c
--- /dev/null
+++ b/tests/trainers/data/test/regression/sbert-base-tnews.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2df2a5f3cdfc6dded52d31a8e97d9a9c41a803cb6d46dee709c51872eda37b21
+size 151830
diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py
index 24f1a2fd..f2adfa22 100644
--- a/tests/trainers/test_finetune_sequence_classification.py
+++ b/tests/trainers/test_finetune_sequence_classification.py
@@ -10,11 +10,14 @@ from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
 from modelscope.trainers import build_trainer
 from modelscope.trainers.hooks import Hook
-from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer
+from modelscope.trainers.nlp_trainer import (EpochBasedTrainer,
+                                             NlpEpochBasedTrainer)
 from modelscope.trainers.optimizer.child_tuning_adamw_optimizer import \
     calculate_fisher
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.data_utils import to_device
+from modelscope.utils.regress_test_utils import MsRegressTool
+from modelscope.utils.test_utils import test_level
 
 
 class TestFinetuneSequenceClassification(unittest.TestCase):
@@ -28,11 +31,76 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         self.tmp_dir = tempfile.TemporaryDirectory().name
         if not os.path.exists(self.tmp_dir):
             os.makedirs(self.tmp_dir)
+        self.regress_tool = MsRegressTool(baseline=False)
 
     def tearDown(self):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_repeatable(self):
+        import torch  # noqa
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'nli'
+            cfg['preprocessor'] = {'type': 'nli-tokenizer'}
+            cfg.train.optimizer.lr = 2e-5
+            cfg['dataset'] = {
+                'train': {
+                    'labels': [
+                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
+                        '11', '12', '13', '14'
+                    ],
+                    'first_sequence':
+                    'sentence',
+                    'label':
+                    'label',
+                }
+            }
+            cfg.train.max_epochs = 5
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'total_iters':
+                int(len(dataset['train']) / 32) * cfg.train.max_epochs,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 100
+            }]
+            return cfg
+
+        dataset = MsDataset.load('clue', subset_name='tnews')
+
+        kwargs = dict(
+            model='damo/nlp_structbert_backbone_base_std',
+            train_dataset=dataset['train'],
+            eval_dataset=dataset['validation'],
+            work_dir=self.tmp_dir,
+            seed=42,
+            cfg_modify_fn=cfg_modify_fn)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+
+        with self.regress_tool.monitor_ms_train(
+                trainer, 'sbert-base-tnews', level='strict'):
+            trainer.train()
+
     def finetune(self,
                  model_id,
                  train_dataset,
@@ -54,7 +122,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
         for i in range(self.epoch_num):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
 
         output_files = os.listdir(
             os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
@@ -118,11 +186,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             }]
             return cfg
 
-        from datasets import load_dataset
-        from datasets import DownloadConfig
-        dc = DownloadConfig()
-        dc.local_files_only = True
-        dataset = load_dataset('clue', 'afqmc', download_config=dc)
+        dataset = MsDataset.load('clue', subset_name='afqmc')
         self.finetune(
             model_id='damo/nlp_structbert_backbone_base_std',
             train_dataset=dataset['train'],
@@ -182,11 +246,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             }]
             return cfg
 
-        from datasets import load_dataset
-        from datasets import DownloadConfig
-        dc = DownloadConfig()
-        dc.local_files_only = True
-        dataset = load_dataset('clue', 'tnews', download_config=dc)
+        dataset = MsDataset.load('clue', subset_name='tnews')
 
         self.finetune(
             model_id='damo/nlp_structbert_backbone_base_std',
diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py
index a561effe..6aefa969 100644
--- a/tests/trainers/test_finetune_text_generation.py
+++ b/tests/trainers/test_finetune_text_generation.py
@@ -129,7 +129,7 @@ class TestFinetuneTextGeneration(unittest.TestCase):
     @unittest.skip
     def test_finetune_cnndm(self):
         from modelscope.msdatasets import MsDataset
-        dataset_dict = MsDataset.load('dureader_robust_qg')
+        dataset_dict = MsDataset.load('DuReader_robust-QG')
         train_dataset = dataset_dict['train'].to_hf_dataset() \
             .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
         eval_dataset = dataset_dict['validation'].to_hf_dataset() \

From cd8ac57fdd85bd09251efa181eb937482e920a09 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Tue, 6 Sep 2022 19:06:49 +0800
Subject: [PATCH 068/175] [to #44742129] support model tag for integration

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10035056

* support model tag for integration
---
 modelscope/utils/device.py     |   4 +-
 modelscope/utils/model_tag.py  | 182 +++++++++++++++++++++++++++++++++
 modelscope/utils/test_utils.py |  32 ++++++
 tests/run.py                   |  21 +++-
 4 files changed, 237 insertions(+), 2 deletions(-)
 create mode 100644 modelscope/utils/model_tag.py

diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py
index 77e23122..40804970 100644
--- a/modelscope/utils/device.py
+++ b/modelscope/utils/device.py
@@ -20,9 +20,11 @@ def verify_device(device_name):
         device info (tuple):  device_type and device_id, if device_id is not set, will use 0 as default.
     """
     device_name = device_name.lower()
-    eles = device_name.split(':')
     err_msg = 'device should be either cpu, cuda, gpu, gpu:X or cuda:X where X is the ordinal for gpu device.'
+    assert device_name is not None and device_name != '', err_msg
+    eles = device_name.split(':')
     assert len(eles) <= 2, err_msg
+    assert device_name is not None
     assert eles[0] in ['cpu', 'cuda', 'gpu'], err_msg
     device_type = eles[0]
     device_id = None
diff --git a/modelscope/utils/model_tag.py b/modelscope/utils/model_tag.py
new file mode 100644
index 00000000..380ddccb
--- /dev/null
+++ b/modelscope/utils/model_tag.py
@@ -0,0 +1,182 @@
+import logging
+import os
+
+import json
+import requests
+
+from modelscope.version import __version__
+
+
+# Model tagging
+class ModelTag(object):
+    _URL = os.environ.get('MODEL_TAG_URL', None)
+
+    # model test results
+    BATCH_COMMIT_RESULT_URL = f'{_URL}/batchCommitResult'
+    # test stage completed
+    BATCH_REFRESH_STAGE_URL = f'{_URL}/batchRefreshStage'
+    # query_model_stage
+    QUERY_MODEL_STAGE_URL = f'{_URL}/queryModelStage'
+
+    HEADER = {'Content-Type': 'application/json'}
+
+    # check result codes
+    MODEL_SKIP = 0
+    MODEL_FAIL = 1
+    MODEL_PASS = 2
+
+    class ItemResult(object):
+
+        def __init__(self):
+            self.result = 0
+            self.name = ''
+            self.info = ''
+
+        def to_json(self):
+            return {
+                'name': self.name,
+                'result': self.result,
+                'info': self.info
+            }
+
+    def __init__(self):
+        self.job_name = ''
+        self.job_id = ''
+        self.model = ''
+        self.sdk_version = ''
+        self.image_version = ''
+        self.domain = ''
+        self.task = ''
+        self.source = ''
+        self.stage = ''
+        # ItemResult list
+        self.item_result = []
+
+    # send the request
+    def _post_request(self, url, param):
+        try:
+            logging.info(url + ' query: '
+                         + str(json.dumps(param, ensure_ascii=False)))
+            res = requests.post(
+                url=url,
+                headers=self.HEADER,
+                data=json.dumps(param, ensure_ascii=False).encode('utf8'))
+            if res.status_code == 200:
+                logging.info(f'{url} post结果: ' + res.text)
+                res_json = json.loads(res.text)
+                if int(res_json['errorCode']) == 200:
+                    return res_json['content']
+                else:
+                    logging.error(res.text)
+            else:
+                logging.error(res.text)
+        except Exception as e:
+            logging.error(e)
+
+        return None
+
+    # submit model test results
+    def batch_commit_result(self):
+        try:
+            param = {
+                'sdkVersion':
+                self.sdk_version,
+                'imageVersion':
+                self.image_version,
+                'source':
+                self.source,
+                'jobName':
+                self.job_name,
+                'jobId':
+                self.job_id,
+                'modelList': [{
+                    'model': self.model,
+                    'domain': self.domain,
+                    'task': self.task,
+                    'itemResult': self.item_result
+                }]
+            }
+            return self._post_request(self.BATCH_COMMIT_RESULT_URL, param)
+
+        except Exception as e:
+            logging.error(e)
+
+        return
+
+    # test stage completed
+    def batch_refresh_stage(self):
+        try:
+            param = {
+                'sdkVersion':
+                self.sdk_version,
+                'imageVersion':
+                self.image_version,
+                'source':
+                self.source,
+                'stage':
+                self.stage,
+                'modelList': [{
+                    'model': self.model,
+                    'domain': self.domain,
+                    'task': self.task
+                }]
+            }
+            return self._post_request(self.BATCH_REFRESH_STAGE_URL, param)
+
+        except Exception as e:
+            logging.error(e)
+
+        return
+
+    # query a model's latest test result for a stage (single result only)
+    def query_model_stage(self):
+        try:
+            param = {
+                'sdkVersion': self.sdk_version,
+                'model': self.model,
+                'stage': self.stage,
+                'imageVersion': self.image_version
+            }
+            return self._post_request(self.QUERY_MODEL_STAGE_URL, param)
+
+        except Exception as e:
+            logging.error(e)
+
+        return None
+
+    # submit model UT test results
+    """
+        model_tag = ModelTag()
+        model_tag.model = "XXX"
+        model_tag.sdk_version = "0.3.7"
+        model_tag.domain = "nlp"
+        model_tag.task = "word-segmentation"
+        item = model_tag.ItemResult()
+        item.result = model_tag.MODEL_PASS
+        item.name = "ALL"
+        item.info = ""
+        model_tag.item_result.append(item.to_json())
+    """
+
+    def commit_ut_result(self):
+        if self._URL is not None:
+            self.job_name = 'UT'
+            self.source = 'dev'
+            self.stage = 'integration'
+
+            self.batch_commit_result()
+            self.batch_refresh_stage()
+
+
+def commit_model_ut_result(model_name, ut_result):
+    model_tag = ModelTag()
+    model_tag.model = model_name.replace('damo/', '')
+    model_tag.sdk_version = __version__
+    # model_tag.domain = ""
+    # model_tag.task = ""
+    item = model_tag.ItemResult()
+    item.result = ut_result
+    item.name = 'ALL'
+    item.info = ''
+    model_tag.item_result.append(item.to_json())
+    model_tag.commit_ut_result()
diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py
index 7adba982..b30c674b 100644
--- a/modelscope/utils/test_utils.py
+++ b/modelscope/utils/test_utils.py
@@ -11,6 +11,7 @@ import sys
 import tarfile
 import tempfile
 import unittest
+from typing import OrderedDict
 
 import requests
 from datasets import Dataset
@@ -71,6 +72,37 @@ def download_and_untar(fpath, furl, dst) -> str:
     return target_dir_path
 
 
+def get_case_model_info():
+    status_code, result = subprocess.getstatusoutput(
+        'grep -rn "damo/" tests/  | grep -v ".pyc" | grep -v "Binary file" | grep -v run.py '
+    )
+    lines = [ln for ln in result.split('\n') if 'damo/' in ln]  # skip noise
+    test_cases = OrderedDict()
+    model_cases = OrderedDict()
+    for line in lines:
+        # "tests/msdatasets/test_ms_dataset.py:92:        model_id = 'damo/bert-base-sst2'"
+        line = line.strip()
+        elements = line.split(':')
+        test_file = elements[0]
+        model_pos = line.find('damo')
+        left_quote = line[model_pos - 1]  # quote char opening the model id
+        rquote_idx = line.rfind(left_quote)
+        model_name = line[model_pos:rquote_idx]
+        if test_file not in test_cases:
+            test_cases[test_file] = set()
+        model_info = test_cases[test_file]
+        model_info.add(model_name)
+
+        if model_name not in model_cases:
+            model_cases[model_name] = set()
+        case_info = model_cases[model_name]
+        case_info.add(
+            test_file.replace('tests/', '').replace('.py',
+                                                    '').replace('/', '.'))
+
+    return model_cases  # model name -> set of dotted test-module names
+
+
 _DIST_SCRIPT_TEMPLATE = """
 import ast
 import argparse
diff --git a/tests/run.py b/tests/run.py
index 478cb9d6..51a563fe 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -24,7 +24,9 @@ import torch
 import yaml
 
 from modelscope.utils.logger import get_logger
-from modelscope.utils.test_utils import set_test_level, test_level
+from modelscope.utils.model_tag import ModelTag, commit_model_ut_result
+from modelscope.utils.test_utils import (get_case_model_info, set_test_level,
+                                         test_level)
 
 logger = get_logger()
 
@@ -62,6 +64,23 @@ def statistics_test_result(df):
         result, total_cases, success_cases, failures_cases, error_cases,
         skipped_cases, expected_failure_cases, unexpected_success_cases)
 
+    model_cases = get_case_model_info()
+    for model_name, case_info in model_cases.items():
+        cases = df.loc[df['Name'].str.contains('|'.join(list(case_info)))]
+        results = cases['Result']
+        model_result = None  # keep `result` intact for the exit check
+        if any(results == 'Error') or any(results == 'Failures') or any(
+                results == 'UnexpectedSuccesses'):
+            model_result = ModelTag.MODEL_FAIL
+        elif any(results == 'Success'):
+            model_result = ModelTag.MODEL_PASS
+        elif all(results == 'Skipped'):
+            model_result = ModelTag.MODEL_SKIP
+        else:
+            print(f'invalid results for {model_name} \n{results}')
+
+        if model_result is not None:
+            commit_model_ut_result(model_name, model_result)
     print('Testing result summary.')
     print(result_msg)
     if result == 'FAILED':

From f7f29ed1ff7bf6b3d4538666e0080792d502ca8c Mon Sep 17 00:00:00 2001
From: "xuangen.hlh" <xuangen.hlh@alibaba-inc.com>
Date: Tue, 6 Sep 2022 20:47:23 +0800
Subject: [PATCH 069/175] =?UTF-8?q?DALL-E=202:=20=E4=BF=AE=E5=A4=8Ddev/dal?=
 =?UTF-8?q?le2=5F1=E5=88=86=E6=94=AF=E9=97=AE=E9=A2=98=EF=BC=8C=E5=A2=9E?=
 =?UTF-8?q?=E5=8A=A0=E6=B5=8B=E8=AF=95=E4=BB=A3=E7=A0=81=EF=BC=8C=E6=9C=AC?=
 =?UTF-8?q?=E5=9C=B0=E6=B5=8B=E8=AF=95=E9=80=9A=E8=BF=87=20=20=20=20=20=20?=
 =?UTF-8?q?=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib?=
 =?UTF-8?q?/codereview/10037492?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 modelscope/metainfo.py                        |   1 +
 modelscope/models/multi_modal/__init__.py     |   6 +-
 .../multi_stage_diffusion/__init__.py         |   1 +
 .../multi_modal/multi_stage_diffusion/clip.py | 318 +++++++++
 .../multi_stage_diffusion/decoder.py          | 322 +++++++++
 .../gaussian_diffusion.py                     | 641 ++++++++++++++++++
 .../multi_stage_diffusion/model.py            | 265 ++++++++
 .../multi_stage_diffusion/prior.py            | 170 +++++
 .../multi_stage_diffusion/tokenizer.py        | 199 ++++++
 .../multi_stage_diffusion/upsampler.py        | 466 +++++++++++++
 .../multi_modal/multi_stage_diffusion/xglm.py | 205 ++++++
 .../text_to_image_synthesis_pipeline.py       |   7 +-
 tests/pipelines/test_multi_stage_diffusion.py |  40 ++
 13 files changed, 2638 insertions(+), 3 deletions(-)
 create mode 100644 modelscope/models/multi_modal/multi_stage_diffusion/__init__.py
 create mode 100644 modelscope/models/multi_modal/multi_stage_diffusion/clip.py
 create mode 100644 modelscope/models/multi_modal/multi_stage_diffusion/decoder.py
 create mode 100644 modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py
 create mode 100644 modelscope/models/multi_modal/multi_stage_diffusion/model.py
 create mode 100644 modelscope/models/multi_modal/multi_stage_diffusion/prior.py
 create mode 100644 modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py
 create mode 100644 modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py
 create mode 100644 modelscope/models/multi_modal/multi_stage_diffusion/xglm.py
 create mode 100644 tests/pipelines/test_multi_stage_diffusion.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 22c2d99e..d7217d57 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -72,6 +72,7 @@ class Models(object):
     gemm = 'gemm-generative-multi-modal'
     mplug = 'mplug'
     diffusion = 'diffusion-text-to-image-synthesis'
+    multi_stage_diffusion = 'multi-stage-diffusion-text-to-image-synthesis'
     team = 'team-multi-modal-similarity'
     video_clip = 'video-clip-multi-modal-embedding'
 
diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py
index 9219a281..0053da43 100644
--- a/modelscope/models/multi_modal/__init__.py
+++ b/modelscope/models/multi_modal/__init__.py
@@ -14,6 +14,8 @@ if TYPE_CHECKING:
     from .ofa_for_all_tasks import OfaForAllTasks
     from .ofa_for_text_to_image_synthesis_model import \
         OfaForTextToImageSynthesis
+    from .multi_stage_diffusion import \
+        MultiStageDiffusionForTextToImageSynthesis
 
 else:
     _import_structure = {
@@ -25,7 +27,9 @@ else:
         'mplug_for_all_tasks': ['MPlugForAllTasks'],
         'ofa_for_all_tasks': ['OfaForAllTasks'],
         'ofa_for_text_to_image_synthesis_model':
-        ['OfaForTextToImageSynthesis']
+        ['OfaForTextToImageSynthesis'],
+        'multi_stage_diffusion':
+        ['MultiStageDiffusionForTextToImageSynthesis']
     }
 
     import sys
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py
new file mode 100644
index 00000000..accbb56e
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py
@@ -0,0 +1 @@
+from .model import MultiStageDiffusionForTextToImageSynthesis
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/clip.py b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py
new file mode 100644
index 00000000..54e971f7
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py
@@ -0,0 +1,318 @@
+# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['CLIP']
+
+
+def to_fp16(m):  # hook for nn.Module.apply: cast a module's params to half
+    if isinstance(m, (nn.Linear, nn.Conv2d)):
+        m.weight.data = m.weight.data.half()
+        if m.bias is not None:
+            m.bias.data = m.bias.data.half()
+    elif hasattr(m, 'head'):  # modules that hold a raw `head` tensor
+        p = getattr(m, 'head')
+        p.data = p.data.half()
+
+
+class QuickGELU(nn.Module):
+    """Activation computing x * sigmoid(1.702 * x)."""
+    def forward(self, x):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class LayerNorm(nn.LayerNorm):
+    r"""nn.LayerNorm variant that normalizes in float32 and casts the
+    result back to the input's dtype, so fp16 inputs are handled."""
+
+    def forward(self, x):
+        return super(LayerNorm, self).forward(x.float()).type_as(x)
+
+
+class SelfAttention(nn.Module):
+    """Multi-head self-attention with an optional attention mask."""
+    def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0):
+        assert dim % num_heads == 0
+        super(SelfAttention, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = 1.0 / math.sqrt(self.head_dim)
+
+        # layers: fused q/k/v projection and output projection
+        self.to_qkv = nn.Linear(dim, dim * 3)
+        self.attn_dropout = nn.Dropout(attn_dropout)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_dropout = nn.Dropout(proj_dropout)
+
+    def forward(self, x, mask=None):
+        r"""x: [B, L, C]; mask: [*, L, L] or None, positions where
+            mask == 0 are excluded from attention. Returns [B, L, C].
+        """
+        b, l, _, n = *x.size(), self.num_heads
+
+        # compute query, key, and value, each reshaped to [B * n, L, head_dim]
+        q, k, v = self.to_qkv(x.transpose(0, 1)).chunk(3, dim=-1)
+        q = q.reshape(l, b * n, -1).transpose(0, 1)
+        k = k.reshape(l, b * n, -1).transpose(0, 1)
+        v = v.reshape(l, b * n, -1).transpose(0, 1)
+
+        # compute attention (softmax is evaluated in float32, then cast back)
+        attn = self.scale * torch.bmm(q, k.transpose(1, 2))
+        if mask is not None:
+            attn = attn.masked_fill(mask[:, :l, :l] == 0, float('-inf'))
+        attn = F.softmax(attn.float(), dim=-1).type_as(attn)
+        attn = self.attn_dropout(attn)
+
+        # gather context and merge the heads back into [B, L, C]
+        x = torch.bmm(attn, v)
+        x = x.view(b, n, l, -1).transpose(1, 2).reshape(b, l, -1)
+
+        # output projection
+        x = self.proj(x)
+        x = self.proj_dropout(x)
+        return x
+
+
+class AttentionBlock(nn.Module):
+    """Transformer block: normed self-attention and MLP, each residual."""
+    def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0):
+        super(AttentionBlock, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+
+        # layers: norm -> attention, norm -> MLP with a 4x hidden width
+        self.norm1 = LayerNorm(dim)
+        self.attn = SelfAttention(dim, num_heads, attn_dropout, proj_dropout)
+        self.norm2 = LayerNorm(dim)
+        self.mlp = nn.Sequential(
+            nn.Linear(dim, dim * 4), QuickGELU(), nn.Linear(dim * 4, dim),
+            nn.Dropout(proj_dropout))
+
+    def forward(self, x, mask=None):
+        x = x + self.attn(self.norm1(x), mask)  # mask is passed to attention
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class VisionTransformer(nn.Module):
+    """Image encoder: patch embedding, class token, AttentionBlocks, head."""
+    def __init__(self,
+                 image_size=224,
+                 patch_size=16,
+                 dim=768,
+                 out_dim=512,
+                 num_heads=12,
+                 num_layers=12,
+                 attn_dropout=0.0,
+                 proj_dropout=0.0,
+                 embedding_dropout=0.0):
+        assert image_size % patch_size == 0
+        super(VisionTransformer, self).__init__()
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.dim = dim
+        self.out_dim = out_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.num_patches = (image_size // patch_size)**2
+
+        # embeddings (random init scaled by 1 / sqrt(dim))
+        gain = 1.0 / math.sqrt(dim)
+        self.patch_embedding = nn.Conv2d(
+            3, dim, kernel_size=patch_size, stride=patch_size, bias=False)
+        self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
+        self.pos_embedding = nn.Parameter(
+            gain * torch.randn(1, self.num_patches + 1, dim))
+        self.dropout = nn.Dropout(embedding_dropout)
+
+        # transformer (nn.Sequential, so blocks are called without a mask)
+        self.pre_norm = LayerNorm(dim)
+        self.transformer = nn.Sequential(*[
+            AttentionBlock(dim, num_heads, attn_dropout, proj_dropout)
+            for _ in range(num_layers)
+        ])
+        self.post_norm = LayerNorm(dim)
+
+        # head: [dim, out_dim] projection matrix
+        self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
+
+    def forward(self, x):
+        b, dtype = x.size(0), self.head.dtype  # compute in the head's dtype
+        x = x.type(dtype)
+
+        # patch-embedding, with the class token prepended at position 0
+        x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)  # [b, n, c]
+        x = torch.cat([self.cls_embedding.repeat(b, 1, 1).type(dtype), x],
+                      dim=1)
+        x = self.dropout(x + self.pos_embedding.type(dtype))
+        x = self.pre_norm(x)
+
+        # transformer
+        x = self.transformer(x)
+
+        # head: project the class-token feature to [b, out_dim]
+        x = self.post_norm(x)
+        x = torch.mm(x[:, 0, :], self.head)
+        return x
+
+    def fp16(self):
+        return self.apply(to_fp16)  # casts params in place, returns self
+
+
+class TextTransformer(nn.Module):
+    """Text encoder: token + position embeddings, causal blocks, head."""
+    def __init__(self,
+                 vocab_size,
+                 text_len,
+                 dim=512,
+                 out_dim=512,
+                 num_heads=8,
+                 num_layers=12,
+                 attn_dropout=0.0,
+                 proj_dropout=0.0,
+                 embedding_dropout=0.0):
+        super(TextTransformer, self).__init__()
+        self.vocab_size = vocab_size
+        self.text_len = text_len
+        self.dim = dim
+        self.out_dim = out_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+
+        # embeddings
+        self.token_embedding = nn.Embedding(vocab_size, dim)
+        self.pos_embedding = nn.Parameter(0.01 * torch.randn(1, text_len, dim))
+        self.dropout = nn.Dropout(embedding_dropout)
+
+        # transformer (nn.ModuleList so the mask can be passed to each block)
+        self.transformer = nn.ModuleList([
+            AttentionBlock(dim, num_heads, attn_dropout, proj_dropout)
+            for _ in range(num_layers)
+        ])
+        self.norm = LayerNorm(dim)
+
+        # head: [dim, out_dim] projection matrix
+        gain = 1.0 / math.sqrt(dim)
+        self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
+
+        # causal attention mask (lower-triangular ones, [1, text_len, text_len])
+        self.register_buffer('attn_mask',
+                             torch.tril(torch.ones(1, text_len, text_len)))
+
+    def forward(self, x):
+        eot, dtype = x.argmax(dim=-1), self.head.dtype  # eot: max-id position
+
+        # embeddings
+        x = self.dropout(
+            self.token_embedding(x).type(dtype)
+            + self.pos_embedding.type(dtype))
+
+        # transformer
+        for block in self.transformer:
+            x = block(x, self.attn_mask)
+
+        # head: project the feature at each sequence's `eot` position
+        x = self.norm(x)
+        x = torch.mm(x[torch.arange(x.size(0)), eot], self.head)
+        return x
+
+    def fp16(self):
+        return self.apply(to_fp16)  # casts params in place, returns self
+
+
+class CLIP(nn.Module):
+    """Dual image/text encoder producing scaled similarity logits."""
+    def __init__(self,
+                 embed_dim=512,
+                 image_size=224,
+                 patch_size=16,
+                 vision_dim=768,
+                 vision_heads=12,
+                 vision_layers=12,
+                 vocab_size=49408,
+                 text_len=77,
+                 text_dim=512,
+                 text_heads=8,
+                 text_layers=12,
+                 attn_dropout=0.0,
+                 proj_dropout=0.0,
+                 embedding_dropout=0.0):
+        super(CLIP, self).__init__()
+        self.embed_dim = embed_dim
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.vision_dim = vision_dim
+        self.vision_heads = vision_heads
+        self.vision_layers = vision_layers
+        self.vocab_size = vocab_size
+        self.text_len = text_len
+        self.text_dim = text_dim
+        self.text_heads = text_heads
+        self.text_layers = text_layers
+
+        # models: both encoders project into the shared embed_dim space
+        self.visual = VisionTransformer(
+            image_size=image_size,
+            patch_size=patch_size,
+            dim=vision_dim,
+            out_dim=embed_dim,
+            num_heads=vision_heads,
+            num_layers=vision_layers,
+            attn_dropout=attn_dropout,
+            proj_dropout=proj_dropout,
+            embedding_dropout=embedding_dropout)
+        self.textual = TextTransformer(
+            vocab_size=vocab_size,
+            text_len=text_len,
+            dim=text_dim,
+            out_dim=embed_dim,
+            num_heads=text_heads,
+            num_layers=text_layers,
+            attn_dropout=attn_dropout,
+            proj_dropout=proj_dropout,
+            embedding_dropout=embedding_dropout)
+        self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
+
+    def forward(self, imgs, txt_tokens):
+        r"""imgs: [B, C, H, W] of torch.float32; txt_tokens: [B, T] of
+            torch.long. Returns (logits_i2t, logits_t2i) similarity logits.
+        """
+        xi = self.visual(imgs)
+        xt = self.textual(txt_tokens)
+
+        # normalize features to unit L2 norm
+        xi = F.normalize(xi, p=2, dim=1)
+        xt = F.normalize(xt, p=2, dim=1)
+
+        # logits: pairwise dot products scaled by exp(log_scale)
+        scale = self.log_scale.exp()
+        logits_i2t = scale * torch.mm(xi, xt.t())
+        logits_t2i = scale * torch.mm(xt, xi.t())
+        return logits_i2t, logits_t2i
+
+    def init_weights(self):
+        """Re-initialize embedding, attention and MLP weights in place."""
+        nn.init.normal_(self.textual.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.visual.patch_embedding.weight, std=0.1)
+
+        # per-modality gains; `transformer` is a container of AttentionBlocks
+        for modality in ['visual', 'textual']:
+            dim = self.vision_dim if modality == 'visual' else self.text_dim
+            transformer = getattr(self, modality).transformer
+            proj_gain = (1.0 / math.sqrt(dim)) * (
+                1.0 / math.sqrt(2 * len(transformer)))
+            attn_gain = 1.0 / math.sqrt(dim)
+            mlp_gain = 1.0 / math.sqrt(2.0 * dim)
+            for block in transformer:
+                nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain)
+                nn.init.normal_(block.attn.proj.weight, std=proj_gain)
+                nn.init.normal_(block.mlp[0].weight, std=mlp_gain)
+                nn.init.normal_(block.mlp[2].weight, std=proj_gain)
+
+    def fp16(self):
+        return self.apply(to_fp16)  # casts params in place, returns self
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py
new file mode 100644
index 00000000..17daedaf
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py
@@ -0,0 +1,322 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['Decoder']
+
+
+def sinusoidal_embedding(timesteps, dim):
+    # returns [len(timesteps), dim]: cos half, sin half, zero pad if dim is odd
+    half = dim // 2
+    timesteps = timesteps.float()
+
+    # compute sinusoidal embedding with frequencies 10000^(-i / half)
+    sinusoid = torch.outer(
+        timesteps, torch.pow(10000,
+                             -torch.arange(half).to(timesteps).div(half)))
+    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
+    if dim % 2 != 0:
+        x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1)
+    return x
+
+
+class Resample(nn.Module):
+    """Resample feature maps by a factor of 0.5, 1.0 (identity) or 2.0."""
+    def __init__(self, in_dim, out_dim, scale_factor, use_conv=False):
+        assert scale_factor in [0.5, 1.0, 2.0]
+        super(Resample, self).__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.scale_factor = scale_factor
+        self.use_conv = use_conv
+
+        # layers (in_dim/out_dim only affect the use_conv=True variants)
+        if scale_factor == 2.0:
+            self.resample = nn.Sequential(
+                nn.Upsample(scale_factor=scale_factor, mode='nearest'),
+                nn.Conv2d(in_dim, out_dim, 3, padding=1)
+                if use_conv else nn.Identity())
+        elif scale_factor == 0.5:
+            self.resample = nn.Conv2d(
+                in_dim, out_dim, 3, stride=2,
+                padding=1) if use_conv else nn.AvgPool2d(
+                    kernel_size=2, stride=2)
+        else:
+            self.resample = nn.Identity()
+
+    def forward(self, x):
+        return self.resample(x)
+
+
+class ResidualBlock(nn.Module):
+    """Residual conv block conditioned on an embedding, with resampling."""
+    def __init__(self,
+                 in_dim,
+                 embed_dim,
+                 out_dim,
+                 use_scale_shift_norm=True,
+                 scale_factor=1.0,
+                 dropout=0.0):
+        super(ResidualBlock, self).__init__()
+        self.in_dim = in_dim
+        self.embed_dim = embed_dim
+        self.out_dim = out_dim
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.scale_factor = scale_factor
+
+        # layers
+        self.layer1 = nn.Sequential(
+            nn.GroupNorm(32, in_dim), nn.SiLU(),
+            nn.Conv2d(in_dim, out_dim, 3, padding=1))
+        self.resample = Resample(in_dim, in_dim, scale_factor, use_conv=False)
+        self.embedding = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(embed_dim,
+                      out_dim * 2 if use_scale_shift_norm else out_dim))
+        self.layer2 = nn.Sequential(
+            nn.GroupNorm(32, out_dim), nn.SiLU(), nn.Dropout(dropout),
+            nn.Conv2d(out_dim, out_dim, 3, padding=1))
+        self.shortcut = nn.Identity() if in_dim == out_dim else nn.Conv2d(
+            in_dim, out_dim, 1)
+
+        # zero out the last layer params (weight only; bias is left as is)
+        nn.init.zeros_(self.layer2[-1].weight)
+
+    def forward(self, x, e):
+        identity = self.resample(x)  # resampled input for the shortcut path
+        x = self.layer1[-1](self.resample(self.layer1[:-1](x)))
+        e = self.embedding(e).unsqueeze(-1).unsqueeze(-1).type(x.dtype)
+        if self.use_scale_shift_norm:  # e modulates the normalized features
+            scale, shift = e.chunk(2, dim=1)
+            x = self.layer2[0](x) * (1 + scale) + shift
+            x = self.layer2[1:](x)
+        else:  # e is added to the features before layer2
+            x = x + e
+            x = self.layer2(x)
+        x = x + self.shortcut(identity)
+        return x
+
+
+class AttentionBlock(nn.Module):
+    """Residual attention over spatial positions plus optional context."""
+    def __init__(self, dim, context_dim=None, num_heads=None, head_dim=None):
+        # consider head_dim first, then num_heads
+        num_heads = dim // head_dim if head_dim else num_heads
+        head_dim = dim // num_heads
+        assert num_heads * head_dim == dim
+        super(AttentionBlock, self).__init__()
+        self.dim = dim
+        self.context_dim = context_dim
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.scale = math.pow(head_dim, -0.25)  # applied to both q and k
+
+        # layers
+        self.norm = nn.GroupNorm(32, dim)
+        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
+        if context_dim is not None:
+            self.context_kv = nn.Linear(context_dim, dim * 2)
+        self.proj = nn.Conv2d(dim, dim, 1)
+
+        # zero out the last layer params (weight only; bias is left as is)
+        nn.init.zeros_(self.proj.weight)
+
+    def forward(self, x, context=None):
+        r"""x:       [B, C, H, W].
+            context: [B, L, C] or None. Returns [B, C, H, W].
+        """
+        identity = x
+        b, c, h, w, n, d = *x.size(), self.num_heads, self.head_dim
+
+        # compute query, key, value, each [b, n, d, h * w]
+        x = self.norm(x)
+        q, k, v = self.to_qkv(x).view(b, n * 3, d, h * w).chunk(3, dim=1)
+        if context is not None:  # prepend context keys/values
+            ck, cv = self.context_kv(context).reshape(b, -1, n * 2,
+                                                      d).permute(0, 2, 3,
+                                                                 1).chunk(
+                                                                     2, dim=1)
+            k = torch.cat([ck, k], dim=-1)
+            v = torch.cat([cv, v], dim=-1)
+
+        # compute attention
+        attn = torch.matmul(q.transpose(-1, -2) * self.scale, k * self.scale)
+        attn = F.softmax(attn, dim=-1)
+
+        # gather context
+        x = torch.matmul(v, attn.transpose(-1, -2))
+        x = x.reshape(b, c, h, w)
+
+        # output projection plus residual connection
+        x = self.proj(x)
+        return x + identity
+
+
+class Decoder(nn.Module):
+    """Encoder/middle/decoder network with skip links, conditioned on t, y."""
+    def __init__(self,
+                 in_dim=3,
+                 dim=512,
+                 y_dim=512,
+                 context_dim=512,
+                 out_dim=6,
+                 dim_mult=[1, 2, 3, 4],
+                 num_heads=None,
+                 head_dim=64,
+                 num_res_blocks=3,
+                 attn_scales=[1 / 2, 1 / 4, 1 / 8],
+                 resblock_resample=True,
+                 use_scale_shift_norm=True,
+                 dropout=0.1):
+        embed_dim = dim * 4
+        super(Decoder, self).__init__()
+        self.in_dim = in_dim
+        self.dim = dim
+        self.y_dim = y_dim
+        self.context_dim = context_dim
+        self.embed_dim = embed_dim
+        self.out_dim = out_dim
+        self.dim_mult = dim_mult
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.resblock_resample = resblock_resample
+        self.use_scale_shift_norm = use_scale_shift_norm
+
+        # params (`scale` tracks the current resolution relative to the input)
+        enc_dims = [dim * u for u in [1] + dim_mult]
+        dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        shortcut_dims = []
+        scale = 1.0
+
+        # embeddings
+        self.time_embedding = nn.Sequential(
+            nn.Linear(dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        self.y_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        self.context_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, context_dim * 4))
+
+        # encoder (the loops below rebind the local in_dim/out_dim names)
+        self.encoder = nn.ModuleList(
+            [nn.Conv2d(self.in_dim, dim, 3, padding=1)])
+        shortcut_dims.append(dim)
+        for i, (in_dim,
+                out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])):
+            for j in range(num_res_blocks):
+                # residual (+attention) blocks
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim, embed_dim, out_dim,
+                                  use_scale_shift_norm, 1.0, dropout)
+                ])
+                if scale in attn_scales:
+                    block.append(
+                        AttentionBlock(out_dim, context_dim, num_heads,
+                                       head_dim))
+                in_dim = out_dim
+                self.encoder.append(block)
+                shortcut_dims.append(out_dim)
+
+                # downsample (after the last block of every stage but the last)
+                if i != len(dim_mult) - 1 and j == num_res_blocks - 1:
+                    if resblock_resample:
+                        downsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                   use_scale_shift_norm, 0.5,
+                                                   dropout)
+                    else:
+                        downsample = Resample(
+                            out_dim, out_dim, 0.5, use_conv=True)
+                    shortcut_dims.append(out_dim)
+                    scale /= 2.0
+                    self.encoder.append(downsample)
+
+        # middle
+        self.middle = nn.ModuleList([
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout),
+            AttentionBlock(out_dim, context_dim, num_heads, head_dim),
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout)
+        ])
+
+        # decoder (each block's input is widened by a popped shortcut dim)
+        self.decoder = nn.ModuleList()
+        for i, (in_dim,
+                out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])):
+            for j in range(num_res_blocks + 1):
+                # residual (+attention) blocks
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim,
+                                  out_dim, use_scale_shift_norm, 1.0, dropout)
+                ])
+                if scale in attn_scales:
+                    block.append(
+                        AttentionBlock(out_dim, context_dim, num_heads,
+                                       head_dim))
+                in_dim = out_dim
+
+                # upsample (after the last block of every stage but the last)
+                if i != len(dim_mult) - 1 and j == num_res_blocks:
+                    if resblock_resample:
+                        upsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                 use_scale_shift_norm, 2.0,
+                                                 dropout)
+                    else:
+                        upsample = Resample(
+                            out_dim, out_dim, 2.0, use_conv=True)
+                    scale *= 2.0
+                    block.append(upsample)
+                self.decoder.append(block)
+
+        # head
+        self.head = nn.Sequential(
+            nn.GroupNorm(32, out_dim), nn.SiLU(),
+            nn.Conv2d(out_dim, self.out_dim, 3, padding=1))
+
+        # zero out the last layer params (weight only; bias is left as is)
+        nn.init.zeros_(self.head[-1].weight)
+
+    def forward(self, x, t, y):
+        # embeddings: e = time + y embedding; y also yields 4 context tokens
+        e = self.time_embedding(sinusoidal_embedding(
+            t, self.dim)) + self.y_embedding(y)
+        context = self.context_embedding(y).view(-1, 4, self.context_dim)
+
+        # encoder (every output is kept as a skip connection)
+        xs = []
+        for block in self.encoder:
+            x = self._forward_single(block, x, e, context)
+            xs.append(x)
+
+        # middle
+        for block in self.middle:
+            x = self._forward_single(block, x, e, context)
+
+        # decoder (skips are consumed in reverse order, concatenated on dim 1)
+        for block in self.decoder:
+            x = torch.cat([x, xs.pop()], dim=1)
+            x = self._forward_single(block, x, e, context)
+
+        # head
+        x = self.head(x)
+        return x
+
+    def _forward_single(self, module, x, e, context):
+        if isinstance(module, ResidualBlock):  # takes the embedding e
+            x = module(x, e)
+        elif isinstance(module, AttentionBlock):  # takes the context tokens
+            x = module(x, context)
+        elif isinstance(module, nn.ModuleList):  # recurse over sub-blocks
+            for block in module:
+                x = self._forward_single(block, x, e, context)
+        else:
+            x = module(x)
+        return x
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py
new file mode 100644
index 00000000..a4fc52e0
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py
@@ -0,0 +1,641 @@
+# The implementation here is modified based on latent diffusion, publicly available
+# at https://github.com/CompVis/latent-diffusion.
+
+import math
+
+import torch
+
+__all__ = ['GaussianDiffusion', 'beta_schedule']
+
+
+def kl_divergence(mu1, logvar1, mu2, logvar2):  # elementwise KL(N1 || N2)
+    u1 = -1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2)
+    u2 = ((mu1 - mu2)**2) * torch.exp(-logvar2)
+    return 0.5 * (u1 + u2)
+
+
+def standard_normal_cdf(x):
+    r"""A fast tanh-based approximation of the standard normal CDF.
+    """
+    return 0.5 * (1.0 + torch.tanh(
+        math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+
+
+def discretized_gaussian_log_likelihood(x0, mean, log_scale):  # elementwise
+    assert x0.shape == mean.shape == log_scale.shape
+    cx = x0 - mean  # centered input
+    inv_stdv = torch.exp(-log_scale)
+    cdf_plus = standard_normal_cdf(inv_stdv * (cx + 1.0 / 255.0))
+    cdf_min = standard_normal_cdf(inv_stdv * (cx - 1.0 / 255.0))
+    log_cdf_plus = torch.log(cdf_plus.clamp(min=1e-12))
+    log_one_minus_cdf_min = torch.log((1.0 - cdf_min).clamp(min=1e-12))
+    cdf_delta = cdf_plus - cdf_min  # mass of the bin x0 +/- 1/255
+    log_probs = torch.where(  # open-ended tails for x0 < -0.999 or > 0.999
+        x0 < -0.999, log_cdf_plus,
+        torch.where(x0 > 0.999, log_one_minus_cdf_min,
+                    torch.log(cdf_delta.clamp(min=1e-12))))
+    assert log_probs.shape == x0.shape
+    return log_probs
+
+
+def _i(tensor, t, x):
+    r"""Gather tensor[t], reshape to [B, 1, ..., 1], cast to x's dtype/device.
+    """
+    shape = (x.size(0), ) + (1, ) * (x.ndim - 1)
+    return tensor[t].view(shape).to(x)
+
+
+def beta_schedule(schedule,
+                  num_timesteps=1000,
+                  init_beta=None,
+                  last_beta=None):
+    if schedule == 'linear':  # evenly spaced betas, float64
+        scale = 1000.0 / num_timesteps
+        init_beta = init_beta or scale * 0.0001  # `or` also replaces 0
+        last_beta = last_beta or scale * 0.02
+        return torch.linspace(
+            init_beta, last_beta, num_timesteps, dtype=torch.float64)
+    elif schedule == 'quadratic':  # evenly spaced in sqrt(beta), then squared
+        init_beta = init_beta or 0.0015
+        last_beta = last_beta or 0.0195
+        return torch.linspace(
+            init_beta**0.5, last_beta**0.5, num_timesteps,
+            dtype=torch.float64)**2
+    elif schedule == 'cosine':  # init_beta / last_beta are ignored here
+        betas = []
+        for step in range(num_timesteps):
+            t1 = step / num_timesteps
+            t2 = (step + 1) / num_timesteps
+            fn_t1 = math.cos((t1 + 0.008) / 1.008 * math.pi / 2)**2
+            fn_t2 = math.cos((t2 + 0.008) / 1.008 * math.pi / 2)**2
+            betas.append(min(1.0 - fn_t2 / fn_t1, 0.999))  # capped at 0.999
+        return torch.tensor(betas, dtype=torch.float64)
+    else:
+        raise ValueError(f'Unsupported schedule: {schedule}')
+
+
+class GaussianDiffusion(object):
+
+    def __init__(self,
+                 betas,
+                 mean_type='eps',
+                 var_type='learned_range',
+                 loss_type='mse',
+                 rescale_timesteps=False):
+        # check input (betas are converted to a float64 tensor if needed)
+        if not isinstance(betas, torch.DoubleTensor):
+            betas = torch.tensor(betas, dtype=torch.float64)
+        assert min(betas) > 0 and max(betas) <= 1
+        assert mean_type in ['x0', 'x_{t-1}', 'eps']
+        assert var_type in [
+            'learned', 'learned_range', 'fixed_large', 'fixed_small'
+        ]
+        assert loss_type in [
+            'mse', 'rescaled_mse', 'kl', 'rescaled_kl', 'l1', 'rescaled_l1'
+        ]
+        self.betas = betas
+        self.num_timesteps = len(betas)
+        self.mean_type = mean_type
+        self.var_type = var_type
+        self.loss_type = loss_type
+        self.rescale_timesteps = rescale_timesteps
+
+        # alphas: cumulative products, shifted by one step (prev, pad 1 / next, pad 0)
+        alphas = 1 - self.betas
+        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
+        self.alphas_cumprod_prev = torch.cat(
+            [alphas.new_ones([1]), self.alphas_cumprod[:-1]])
+        self.alphas_cumprod_next = torch.cat(
+            [self.alphas_cumprod[1:],
+             alphas.new_zeros([1])])
+
+        # q(x_t | x_{t-1})
+        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0
+                                                        - self.alphas_cumprod)
+        self.log_one_minus_alphas_cumprod = torch.log(1.0
+                                                      - self.alphas_cumprod)
+        self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod)
+        self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod
+                                                      - 1)
+
+        # q(x_{t-1} | x_t, x_0); variance is clamped to 1e-20 before the log
+        self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (
+            1.0 - self.alphas_cumprod)
+        self.posterior_log_variance_clipped = torch.log(
+            self.posterior_variance.clamp(1e-20))
+        self.posterior_mean_coef1 = betas * torch.sqrt(
+            self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        self.posterior_mean_coef2 = (
+            1.0 - self.alphas_cumprod_prev) * torch.sqrt(alphas) / (
+                1.0 - self.alphas_cumprod)
+
+    def q_sample(self, x0, t, noise=None):
+        r"""Sample x_t from q(x_t | x_0); noise defaults to randn_like(x0).
+        """
+        noise = torch.randn_like(x0) if noise is None else noise
+        u1 = _i(self.sqrt_alphas_cumprod, t, x0) * x0  # scaled signal
+        u2 = _i(self.sqrt_one_minus_alphas_cumprod, t, x0) * noise
+        return u1 + u2
+
+    def q_mean_variance(self, x0, t):
+        r"""Distribution of q(x_t | x_0): returns (mean, var, log_var).
+        """
+        mu = _i(self.sqrt_alphas_cumprod, t, x0) * x0
+        var = _i(1.0 - self.alphas_cumprod, t, x0)
+        log_var = _i(self.log_one_minus_alphas_cumprod, t, x0)
+        return mu, var, log_var
+
+    def q_posterior_mean_variance(self, x0, xt, t):
+        r"""Distribution of q(x_{t-1} | x_t, x_0): (mean, var, log_var).
+        """
+        mu = _i(self.posterior_mean_coef1, t, xt) * x0 + _i(
+            self.posterior_mean_coef2, t, xt) * xt
+        var = _i(self.posterior_variance, t, xt)
+        log_var = _i(self.posterior_log_variance_clipped, t, xt)  # clamped log
+        return mu, var, log_var
+
+    @torch.no_grad()
+    def p_sample(self,
+                 xt,
+                 t,
+                 model,
+                 model_kwargs={},
+                 clamp=None,
+                 percentile=None,
+                 condition_fn=None,
+                 guide_scale=None):
+        r"""Sample from p(x_{t-1} | x_t); returns (x_{t-1}, predicted x0).
+            - condition_fn: for classifier-based guidance (guided-diffusion).
+            - guide_scale: for classifier-free guidance (glide/dalle-2).
+        """
+        # predict distribution of p(x_{t-1} | x_t)
+        mu, var, log_var, x0 = self.p_mean_variance(xt, t, model, model_kwargs,
+                                                    clamp, percentile,
+                                                    guide_scale)
+
+        # random sample (with optional conditional function)
+        noise = torch.randn_like(xt)
+        shape = (-1, *((1, ) * (xt.ndim - 1)))
+        mask = t.ne(0).float().view(shape)  # no noise when t == 0
+        if condition_fn is not None:  # shift the mean by var * gradient
+            grad = condition_fn(xt, self._scale_timesteps(t), **model_kwargs)
+            mu = mu.float() + var * grad.float()
+        xt_1 = mu + mask * torch.exp(0.5 * log_var) * noise
+        return xt_1, x0
+
+    @torch.no_grad()
+    def p_sample_loop(self,
+                      noise,
+                      model,
+                      model_kwargs={},
+                      clamp=None,
+                      percentile=None,
+                      condition_fn=None,
+                      guide_scale=None):
+        r"""Run the whole chain p(x_{T-1} | x_T) ... p(x_0 | x_1) starting at `noise`.
+        """
+        batch_size, sample = noise.size(0), noise
+        timesteps = torch.arange(self.num_timesteps - 1, -1, -1)
+
+        # walk from the last timestep down to 0, one p_sample call per step
+        for step in timesteps:
+            t = torch.full((batch_size, ),
+                           step, dtype=torch.long, device=sample.device)
+            sample, _ = self.p_sample(sample, t, model, model_kwargs, clamp,
+                                      percentile, condition_fn, guide_scale)
+        return sample
+
+    def p_mean_variance(self,
+                        xt,
+                        t,
+                        model,
+                        model_kwargs={},
+                        clamp=None,
+                        percentile=None,
+                        guide_scale=None):
+        r"""Distribution of p(x_{t-1} | x_t); returns (mu, var, log_var, x0).
+        """
+        # predict distribution
+        if guide_scale is None:
+            out = model(xt, self._scale_timesteps(t), **model_kwargs)
+        else:
+            # classifier-free guidance
+            # (model_kwargs[0]: conditional kwargs; model_kwargs[1]: non-conditional kwargs)
+            assert isinstance(model_kwargs, list) and len(model_kwargs) == 2
+            y_out = model(xt, self._scale_timesteps(t), **model_kwargs[0])
+            u_out = model(xt, self._scale_timesteps(t), **model_kwargs[1])
+            cond = self.var_type.startswith('fixed')
+            dim = y_out.size(1) if cond else y_out.size(1) // 2
+            u1 = u_out[:, :dim]
+            u2 = guide_scale * (y_out[:, :dim] - u_out[:, :dim])
+            out = torch.cat([u1 + u2, y_out[:, dim:]], dim=1)
+
+        # compute variance (learned types split the output channels in two halves)
+        if self.var_type == 'learned':
+            out, log_var = out.chunk(2, dim=1)
+            var = torch.exp(log_var)
+        elif self.var_type == 'learned_range':
+            out, fraction = out.chunk(2, dim=1)
+            min_log_var = _i(self.posterior_log_variance_clipped, t, xt)
+            max_log_var = _i(torch.log(self.betas), t, xt)
+            fraction = (fraction + 1) / 2.0
+            log_var = fraction * max_log_var + (1 - fraction) * min_log_var
+            var = torch.exp(log_var)
+        elif self.var_type == 'fixed_large':
+            var = _i(
+                torch.cat([self.posterior_variance[1:2], self.betas[1:]]), t,
+                xt)
+            log_var = torch.log(var)
+        elif self.var_type == 'fixed_small':
+            var = _i(self.posterior_variance, t, xt)
+            log_var = _i(self.posterior_log_variance_clipped, t, xt)
+
+        # compute mean and x0
+        if self.mean_type == 'x_{t-1}':
+            mu = out  # x_{t-1}
+            u1 = _i(1.0 / self.posterior_mean_coef1, t, xt) * mu
+            u2 = _i(self.posterior_mean_coef2 / self.posterior_mean_coef1, t,
+                    xt) * xt
+            x0 = u1 - u2
+        elif self.mean_type == 'x0':
+            x0 = out
+            mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)
+        elif self.mean_type == 'eps':
+            u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * out
+            x0 = u1 - u2
+            mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)
+
+        # restrict the range of x0 only (mu above is not recomputed from the restricted x0)
+        if percentile is not None:
+            assert percentile > 0 and percentile <= 1  # e.g., 0.995
+            # per-sample quantile of |x0|, floored at 1, shaped to broadcast over any rank
+            s = torch.quantile(x0.flatten(1).abs(), percentile, dim=1)
+            s = s.clamp_(1.0).view(-1, *((1, ) * (x0.ndim - 1)))
+            x0 = torch.min(s, torch.max(-s, x0)) / s
+        elif clamp is not None:
+            x0 = x0.clamp(-clamp, clamp)
+        return mu, var, log_var, x0
+
+    @torch.no_grad()
+    def ddim_sample(self,
+                    xt,
+                    t,
+                    model,
+                    model_kwargs={},
+                    clamp=None,
+                    percentile=None,
+                    condition_fn=None,
+                    guide_scale=None,
+                    ddim_timesteps=20,
+                    eta=0.0):
+        r"""Take one DDIM step from x_t to x_{t - stride}; returns (x_prev, x0).
+            - condition_fn: gradient callback for classifier-based guidance.
+            - guide_scale: scale for classifier-free guidance.
+            - eta: multiplier on the injected noise scale.
+        """
+        # distance (in original timesteps) covered by one DDIM step
+        stride = self.num_timesteps // ddim_timesteps
+
+        # schedule coefficients gathered at t
+        recip = _i(self.sqrt_recip_alphas_cumprod, t, xt)
+        recipm1 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+        alphas = _i(self.alphas_cumprod, t, xt)
+
+        # only the x0 estimate of p(x_{t-1} | x_t) is needed here
+        x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp,
+                                  percentile, guide_scale)[3]
+        if condition_fn is not None:
+            # x0 -> eps, shift eps by the scaled guidance gradient, eps -> x0
+            guided = (recip * xt - x0) / recipm1
+            guided = guided - (1 - alphas).sqrt() * condition_fn(
+                xt, self._scale_timesteps(t), **model_kwargs)
+            x0 = recip * xt - recipm1 * guided
+
+        # noise estimate implied by the (possibly guided) x0
+        eps = (recip * xt - x0) / recipm1
+
+        # noise scale of this step
+        alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
+        ratio = (1 - alphas_prev) / (1 - alphas)
+        decay = 1 - alphas / alphas_prev
+        sigmas = eta * torch.sqrt(ratio * decay)
+
+        # combine the x0 term, the eps direction and fresh noise (none at t == 0)
+        noise = torch.randn_like(xt)
+        keep_noise = (t != 0).float().view(-1, *([1] * (xt.ndim - 1)))
+        direction = torch.sqrt(1 - alphas_prev - sigmas**2) * eps
+        x0_term = torch.sqrt(alphas_prev) * x0
+        xt_1 = x0_term + direction + keep_noise * sigmas * noise
+        return xt_1, x0
+
+    @torch.no_grad()
+    def ddim_sample_loop(self,
+                         noise,
+                         model,
+                         model_kwargs={},
+                         clamp=None,
+                         percentile=None,
+                         condition_fn=None,
+                         guide_scale=None,
+                         ddim_timesteps=20,
+                         eta=0.0):
+        r"""Denoise `noise` with a strided chain of ddim_sample calls."""
+        stride = self.num_timesteps // ddim_timesteps
+        # steps 1, 1 + stride, ... capped at num_timesteps - 1, visited high to low
+        # (TODO: clamp is inaccurate! Consider replacing the stride by explicit prev/next steps)
+        steps = torch.arange(0, self.num_timesteps, stride)
+        steps = (1 + steps).clamp(0, self.num_timesteps - 1).flip(0)
+        batch_size, sample = noise.size(0), noise
+        for step in steps:
+            t = torch.full((batch_size, ),
+                           step, dtype=torch.long, device=sample.device)
+            sample, _ = self.ddim_sample(
+                sample, t, model, model_kwargs, clamp, percentile,
+                condition_fn, guide_scale, ddim_timesteps, eta)
+        return sample
+
+    @torch.no_grad()
+    def ddim_reverse_sample(self,
+                            xt,
+                            t,
+                            model,
+                            model_kwargs={},
+                            clamp=None,
+                            percentile=None,
+                            guide_scale=None,
+                            ddim_timesteps=20):
+        r"""One deterministic DDIM inversion step; returns (x_next, x0)."""
+        stride = self.num_timesteps // ddim_timesteps
+        recip = _i(self.sqrt_recip_alphas_cumprod, t, xt)
+        recipm1 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+
+        # x0 estimate from the model prediction at (x_t, t)
+        x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp,
+                                  percentile, guide_scale)[3]
+
+        # noise estimate implied by x0
+        eps = (recip * xt - x0) / recipm1
+
+        # cumulative alpha at the target step; a trailing 0 entry is appended so
+        # that an index of num_timesteps (one past the schedule) stays valid
+        padded = torch.cat(
+            [self.alphas_cumprod, self.alphas_cumprod.new_zeros([1])])
+        next_t = (t + stride).clamp(0, self.num_timesteps)
+        alphas_next = _i(padded, next_t, xt)
+
+        # re-noise x0 to the target step without drawing fresh noise
+        mu = torch.sqrt(alphas_next) * x0 + torch.sqrt(1 - alphas_next) * eps
+        return mu, x0
+
+    @torch.no_grad()
+    def ddim_reverse_sample_loop(self,
+                                 x0,
+                                 model,
+                                 model_kwargs={},
+                                 clamp=None,
+                                 percentile=None,
+                                 guide_scale=None,
+                                 ddim_timesteps=20):
+        r"""Run ddim_reverse_sample over ascending timesteps, starting from x0."""
+        stride = self.num_timesteps // ddim_timesteps
+        batch_size = x0.size(0)
+        latent = x0
+
+        # visit timesteps 0, stride, 2 * stride, ... in ascending order
+        for step in torch.arange(0, self.num_timesteps, stride):
+            t = torch.full((batch_size, ),
+                           step, dtype=torch.long, device=latent.device)
+            latent, _ = self.ddim_reverse_sample(
+                latent, t, model, model_kwargs, clamp, percentile, guide_scale,
+                ddim_timesteps)
+        return latent
+
+    @torch.no_grad()
+    def plms_sample(self,
+                    xt,
+                    t,
+                    model,
+                    model_kwargs={},
+                    clamp=None,
+                    percentile=None,
+                    condition_fn=None,
+                    guide_scale=None,
+                    plms_timesteps=20,
+                    eps_cache=None):
+        r"""Sample from p(x_{t-1} | x_t) using PLMS; returns (x_{t-1}, x0, eps).
+            - condition_fn: for classifier-based guidance (guided-diffusion).
+            - guide_scale: for classifier-free guidance (glide/dalle-2).
+            - eps_cache: eps of previous steps, oldest first (None: no history).
+        """
+        stride = self.num_timesteps // plms_timesteps
+        eps_cache = [] if eps_cache is None else eps_cache
+
+        # function for compute eps
+        def compute_eps(xt, t):
+            # predict distribution of p(x_{t-1} | x_t)
+            _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs,
+                                               clamp, percentile, guide_scale)
+
+            if condition_fn is not None:
+                # x0 -> eps
+                alpha = _i(self.alphas_cumprod, t, xt)
+                u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0)
+                u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+                eps = u1 / u2
+                eps = eps - (1 - alpha).sqrt() * condition_fn(
+                    xt, self._scale_timesteps(t), **model_kwargs)
+                # eps -> x0
+                u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt
+                u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps
+                x0 = u1 - u2
+
+            # derive eps
+            u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0)
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+            eps = u1 / u2
+            return eps
+
+        # function for compute x_0 and x_{t-1} (always steps from the outer xt)
+        def compute_x0(eps, t):
+            # eps -> x0
+            u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps
+            x0 = u1 - u2
+
+            # deterministic sample
+            alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
+            direction = torch.sqrt(1 - alphas_prev) * eps
+            xt_1 = torch.sqrt(alphas_prev) * x0 + direction
+            return xt_1, x0
+
+        # PLMS sample: the order of the multistep rule grows with the history
+        eps = compute_eps(xt, t)
+        if len(eps_cache) == 0:
+            # 2nd order pseudo improved Euler
+            xt_1, x0 = compute_x0(eps, t)
+            eps_next = compute_eps(xt_1, (t - stride).clamp(0))
+            eps_prime = (eps + eps_next) / 2.0
+        elif len(eps_cache) == 1:
+            # 2nd order pseudo linear multistep (Adams-Bashforth)
+            eps_prime = (3 * eps - eps_cache[-1]) / 2.0
+        elif len(eps_cache) == 2:
+            # 3rd order pseudo linear multistep (Adams-Bashforth)
+            eps_prime = (23 * eps - 16 * eps_cache[-1]
+                         + 5 * eps_cache[-2]) / 12.0
+        elif len(eps_cache) >= 3:
+            # 4th order pseudo linear multistep (Adams-Bashforth)
+            eps_prime = (55 * eps - 59 * eps_cache[-1] + 37 * eps_cache[-2]
+                         - 9 * eps_cache[-3]) / 24.0
+        xt_1, x0 = compute_x0(eps_prime, t)
+        return xt_1, x0, eps
+
+    @torch.no_grad()
+    def plms_sample_loop(self,
+                         noise,
+                         model,
+                         model_kwargs={},
+                         clamp=None,
+                         percentile=None,
+                         condition_fn=None,
+                         guide_scale=None,
+                         plms_timesteps=20):
+        r"""Denoise `noise` with a strided chain of plms_sample calls."""
+        stride = self.num_timesteps // plms_timesteps
+        batch_size, sample = noise.size(0), noise
+
+        # steps 1, 1 + stride, ... capped at num_timesteps - 1, visited high to low
+        steps = torch.arange(0, self.num_timesteps, stride)
+        steps = (1 + steps).clamp(0, self.num_timesteps - 1).flip(0)
+
+        # eps of the most recent steps (oldest first), handed to plms_sample
+        history = []
+        for step in steps:
+            t = torch.full((batch_size, ),
+                           step, dtype=torch.long, device=sample.device)
+            sample, _, eps = self.plms_sample(
+                sample, t, model, model_kwargs, clamp, percentile,
+                condition_fn, guide_scale, plms_timesteps, history)
+
+            # remember this eps and keep only the three most recent entries
+            history.append(eps)
+            if len(history) > 3:
+                del history[0]
+        return sample
+
+    def loss(self, x0, t, model, model_kwargs={}, noise=None, input_x0=None):
+        r"""Per-sample training loss at timestep t, selected by self.loss_type."""
+        if noise is None:
+            noise = torch.randn_like(x0)
+        if input_x0 is None:
+            input_x0 = x0
+        xt = self.q_sample(input_x0, t, noise=noise)
+
+        # pure variational-bound objectives
+        if self.loss_type in ('kl', 'rescaled_kl'):
+            loss, _ = self.variational_lower_bound(x0, xt, t, model,
+                                                   model_kwargs)
+            if self.loss_type == 'rescaled_kl':
+                loss = loss * self.num_timesteps
+        # regression objectives, plus a VLB term when the variance is learned
+        elif self.loss_type in ('mse', 'rescaled_mse', 'l1', 'rescaled_l1'):
+            out = model(xt, self._scale_timesteps(t), **model_kwargs)
+            loss_vlb = 0.0
+            if self.var_type in ('learned', 'learned_range'):
+                # detach the mean half so the VLB only trains the variance half
+                out, var = out.chunk(2, dim=1)
+                frozen = torch.cat([out.detach(), var], dim=1)
+                loss_vlb, _ = self.variational_lower_bound(
+                    x0, xt, t, model=lambda *args, **kwargs: frozen)
+                if self.loss_type.startswith('rescaled_'):
+                    loss_vlb = loss_vlb * self.num_timesteps / 1000.0
+
+            # regression target follows self.mean_type
+            targets = {
+                'eps': noise,
+                'x0': x0,
+                'x_{t-1}': self.q_posterior_mean_variance(x0, xt, t)[0]
+            }
+            diff = out - targets[self.mean_type]
+            power = 1 if self.loss_type.endswith('l1') else 2
+            loss = diff.pow(power).abs().flatten(1).mean(dim=1)
+            loss = loss + loss_vlb
+        return loss
+
+    def variational_lower_bound(self,
+                                x0,
+                                xt,
+                                t,
+                                model,
+                                model_kwargs={},
+                                clamp=None,
+                                percentile=None):
+        r"""Per-sample VLB term at step t; returns (vlb, predicted x0)."""
+        # groundtruth and predicted distributions (the prediction must not shadow
+        # x0: the NLL below has to be scored against the ground-truth x0)
+        mu1, _, log_var1 = self.q_posterior_mean_variance(x0, xt, t)
+        mu2, _, log_var2, pred_x0 = self.p_mean_variance(
+            xt, t, model, model_kwargs, clamp, percentile)
+        # compute KL loss (divided by ln 2)
+        kl = kl_divergence(mu1, log_var1, mu2, log_var2)
+        kl = kl.flatten(1).mean(dim=1) / math.log(2.0)
+
+        # compute discretized NLL loss (for p(x0 | x1) only)
+        nll = -discretized_gaussian_log_likelihood(
+            x0, mean=mu2, log_scale=0.5 * log_var2)
+        nll = nll.flatten(1).mean(dim=1) / math.log(2.0)
+        # NLL for p(x0 | x1) and KL otherwise
+        vlb = torch.where(t == 0, nll, kl)
+        return vlb, pred_x0
+
+    @torch.no_grad()
+    def variational_lower_bound_loop(self,
+                                     x0,
+                                     model,
+                                     model_kwargs={},
+                                     clamp=None,
+                                     percentile=None):
+        r"""Compute the entire variational lower bound, measured in bits-per-dim.
+        """
+        # prepare input and output
+        b = x0.size(0)
+        metrics = {'vlb': [], 'mse': [], 'x0_mse': []}
+
+        for step in torch.arange(self.num_timesteps).flip(0):
+            # compute VLB
+            t = torch.full((b, ), step, dtype=torch.long, device=x0.device)
+            noise = torch.randn_like(x0)
+            xt = self.q_sample(x0, t, noise)
+            vlb, pred_x0 = self.variational_lower_bound(
+                x0, xt, t, model, model_kwargs, clamp, percentile)
+
+            # predict eps from the model's x0 estimate (using the true x0 here
+            # would just recover `noise` and make the 'mse' metric meaningless)
+            u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - pred_x0)
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+            eps = u1 / u2
+            # collect metrics (one column per step, in descending step order)
+            metrics['vlb'].append(vlb)
+            metrics['x0_mse'].append(
+                (pred_x0 - x0).square().flatten(1).mean(dim=1))
+            metrics['mse'].append(
+                (eps - noise).square().flatten(1).mean(dim=1))
+        metrics = {k: torch.stack(v, dim=1) for k, v in metrics.items()}
+
+        # prior KL term at the last timestep (the loop variable ends at t == 0)
+        t = torch.full((b, ), self.num_timesteps - 1,
+                       dtype=torch.long, device=x0.device)
+        mu, _, log_var = self.q_mean_variance(x0, t)
+        kl_prior = kl_divergence(mu, log_var, torch.zeros_like(mu),
+                                 torch.zeros_like(log_var))
+        kl_prior = kl_prior.flatten(1).mean(dim=1) / math.log(2.0)
+        # update metrics
+        metrics['prior_bits_per_dim'] = kl_prior
+        metrics['total_bits_per_dim'] = metrics['vlb'].sum(dim=1) + kl_prior
+        return metrics
+
+    def _scale_timesteps(self, t):
+        r"""Return float t * 1000 / num_timesteps if rescale_timesteps is set, else t."""
+        rescaled = self.rescale_timesteps
+        return t.float() * 1000.0 / self.num_timesteps if rescaled else t
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/model.py b/modelscope/models/multi_modal/multi_stage_diffusion/model.py
new file mode 100644
index 00000000..c2d83b34
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/model.py
@@ -0,0 +1,265 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import math
+import os.path as osp
+from typing import Any, Dict
+
+import json
+import numpy as np
+import torch
+import torch.cuda.amp as amp
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+
+from modelscope.metainfo import Models
+from modelscope.models import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.multi_modal.multi_stage_diffusion.clip import CLIP
+from modelscope.models.multi_modal.multi_stage_diffusion.decoder import Decoder
+from modelscope.models.multi_modal.multi_stage_diffusion.gaussian_diffusion import (
+    GaussianDiffusion, beta_schedule)
+from modelscope.models.multi_modal.multi_stage_diffusion.prior import Prior
+from modelscope.models.multi_modal.multi_stage_diffusion.tokenizer import (
+    CLIPTokenizer, XGLMTokenizer)
+from modelscope.models.multi_modal.multi_stage_diffusion.upsampler import (
+    Upsampler256, Upsampler1024)
+from modelscope.models.multi_modal.multi_stage_diffusion.xglm import XGLM
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['MultiStageDiffusionForTextToImageSynthesis']
+
+
+def make_diffusion(schedule,
+                   num_timesteps=1000,
+                   init_beta=None,
+                   last_beta=None,
+                   mean_type='eps',
+                   var_type='fixed_small'):
+    r"""Build a GaussianDiffusion from the betas of the named schedule."""
+    return GaussianDiffusion(
+        beta_schedule(schedule, num_timesteps, init_beta, last_beta),
+        mean_type=mean_type, var_type=var_type)
+
+
+class UnCLIP(nn.Module):
+    r"""Text-to-image stack: text encoder -> prior -> 64px decoder -> 256px and
+    1024px upsamplers, each stage paired with its own diffusion schedule.
+    """
+
+    def __init__(self, model_dir):
+        super(UnCLIP, self).__init__()
+        self.model_dir = model_dir
+        # close the configuration file deterministically instead of leaking it
+        with open(f'{model_dir}/{ModelFile.CONFIGURATION}') as f:
+            self.config = json.load(f)
+
+        # modules
+        self.clip = CLIP(**self.config['clip']).fp16()
+        self.xglm = XGLM(**self.config['xglm'])
+        self.prior = Prior(**self.config['prior'])
+        self.decoder = Decoder(**self.config['decoder'])
+        self.upsampler256 = Upsampler256(**self.config['upsampler256'])
+        self.upsampler1024 = Upsampler1024(**self.config['upsampler1024'])
+
+        # diffusions
+        self.prior_diffusion = make_diffusion(**self.config['prior_diffusion'])
+        self.decoder_diffusion = make_diffusion(
+            **self.config['decoder_diffusion'])
+        self.upsampler256_diffusion = make_diffusion(
+            **self.config['upsampler256_diffusion'])
+        self.upsampler1024_diffusion = make_diffusion(
+            **self.config['upsampler1024_diffusion'])
+
+        # tokenizers
+        self.clip_tokenizer = CLIPTokenizer(
+            bpe_path=f'{model_dir}/bpe_simple_vocab_16e6.txt.gz')
+        self.xglm_tokenizer = XGLMTokenizer(model_dir=model_dir)
+
+    def forward(self, *args, **kwargs):
+        raise NotImplementedError(
+            '"forward" is not implemented. Use "synthesis" instead.')
+
+    @torch.no_grad()
+    def synthesis(self,
+                  text='A photo of a confused grizzly bear in calculus class.',
+                  tokenizer='clip',
+                  batch_size=4,
+                  timesteps_prior=100,
+                  timesteps_64=50,
+                  timesteps_256=20,
+                  timesteps_1024=20,
+                  guide_prior=3.0,
+                  guide_64=7.0,
+                  guide_256=3.0,
+                  guide_1024=3.0,
+                  eta_prior=0.0,
+                  eta_64=0.0,
+                  eta_256=0.0,
+                  eta_1024=0.0):
+        r"""Generate `batch_size` images for `text`; returns a list of PIL images.
+            - tokenizer: text encoder to use, 'clip' or 'xglm'.
+            - timesteps_* / guide_* / eta_*: DDIM step count, classifier-free
+              guidance scale and DDIM eta of each stage (prior, 64, 256, 1024).
+        """
+        device = next(self.parameters()).device
+
+        # check params
+        assert all([
+            t > 0 and t <= 1000 for t in
+            [timesteps_prior, timesteps_64, timesteps_256, timesteps_1024]
+        ])
+        assert all([
+            g > 1 and g < 15
+            for g in [guide_prior, guide_64, guide_256, guide_1024]
+        ])
+        assert all([
+            e >= 0 and e <= 1.0
+            for e in [eta_prior, eta_64, eta_256, eta_1024]
+        ])
+        assert batch_size >= 1 and batch_size <= 16
+
+        # encode the prompt and the empty prompt (the non-conditional branch)
+        if tokenizer == 'clip':
+            def encode(texts):
+                return self.clip.textual(self.clip_tokenizer(texts).to(device))
+        elif tokenizer == 'xglm':
+            def encode(texts):
+                # `to_device` was never defined in this module; assumes the
+                # tokenizer returns an iterable of tensors — TODO confirm
+                tokens = self.xglm_tokenizer(texts)
+                return self.xglm(*[u.to(device) for u in tokens])
+        else:
+            raise ValueError(
+                f'Expected tokenizer to be one of "clip" or "xglm", but got {tokenizer}'
+            )
+        y = F.normalize(encode([text]), p=2, dim=1)
+        zero_y = F.normalize(encode(['']), p=2, dim=1)
+        y = math.sqrt(y.size(1)) * y.repeat(batch_size, 1)
+        zero_y = math.sqrt(zero_y.size(1)) * zero_y.repeat(batch_size, 1)
+
+        # synthesis
+        with amp.autocast(enabled=True):
+            # prior
+            x0 = self.prior_diffusion.ddim_sample_loop(
+                noise=torch.randn_like(y),
+                model=self.prior,
+                model_kwargs=[{
+                    'y': y
+                }, {
+                    'y': zero_y
+                }],
+                guide_scale=guide_prior,
+                ddim_timesteps=timesteps_prior,
+                eta=eta_prior)
+
+            # decoder
+            imgs64 = self.decoder_diffusion.ddim_sample_loop(
+                noise=torch.randn(batch_size, 3, 64, 64).to(device),
+                model=self.decoder,
+                model_kwargs=[{
+                    'y': x0
+                }, {
+                    'y': torch.zeros_like(x0)
+                }],
+                guide_scale=guide_64,
+                percentile=0.995,
+                ddim_timesteps=timesteps_64,
+                eta=eta_64).clamp_(-1, 1)
+
+            # upsampler256
+            imgs256 = F.interpolate(
+                imgs64, scale_factor=4.0, mode='bilinear', align_corners=False)
+            imgs256 = self.upsampler256_diffusion.ddim_sample_loop(
+                noise=torch.randn_like(imgs256),
+                model=self.upsampler256,
+                model_kwargs=[{
+                    'y': y,
+                    'concat': imgs256
+                }, {
+                    'y': zero_y,
+                    'concat': imgs256
+                }],
+                guide_scale=guide_256,
+                percentile=0.995,
+                ddim_timesteps=timesteps_256,
+                eta=eta_256).clamp_(-1, 1)
+
+            # upsampler1024
+            imgs1024 = F.interpolate(
+                imgs256, scale_factor=4.0, mode='bilinear', align_corners=False)
+            imgs1024 = self.upsampler1024_diffusion.ddim_sample_loop(
+                noise=torch.randn_like(imgs1024),
+                model=self.upsampler1024,
+                model_kwargs=[{
+                    'y': y,
+                    'concat': imgs1024
+                }, {
+                    'y': zero_y,
+                    'concat': imgs1024
+                }],
+                guide_scale=guide_1024,
+                percentile=0.995,
+                ddim_timesteps=timesteps_1024,
+                eta=eta_1024).clamp_(-1, 1)
+
+        # output: list of B PIL images built from [B, H, W, C] uint8 data in [0, 255]
+        imgs1024 = imgs1024.add_(1).mul_(255 / 2.0).permute(0, 2, 3, 1).cpu()
+        imgs1024 = [
+            Image.fromarray(np.array(u, dtype=np.uint8)) for u in imgs1024
+        ]
+        return imgs1024
+
+
+@MODELS.register_module(
+    Tasks.text_to_image_synthesis, module_name=Models.multi_stage_diffusion)
+class MultiStageDiffusionForTextToImageSynthesis(TorchModel):
+    r"""Registered text-to-image model that loads UnCLIP weights and exposes
+    UnCLIP.synthesis through forward()."""
+
+    def __init__(self, model_dir, device_id=-1):
+        r"""Load the checkpoint from model_dir; device_id >= 0 selects a CUDA device."""
+        super().__init__(model_dir=model_dir, device_id=device_id)
+        checkpoint = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
+        model = UnCLIP(model_dir=model_dir)
+        # weights are deserialized on CPU; the module is moved afterwards
+        model.load_state_dict(torch.load(checkpoint, 'cpu'))
+        model.eval()
+
+        self.device_id = device_id
+        if self.device_id < 0:
+            self.device = torch.device('cpu')
+            logger.info('Use CPU for inference')
+        else:
+            self.device = torch.device(f'cuda:{self.device_id}')
+            model.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        self.model = model
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        r"""Validate `input`, run synthesis and return one array per image with
+        the last (channel) axis reversed."""
+        if not isinstance(input, dict):
+            raise ValueError(
+                f'Expected the input to be a dictionary, but got {type(input)}'
+            )
+        if 'text' not in input:
+            raise ValueError('input should contain "text", but not found')
+
+        # optional sampling settings and the values used when they are absent
+        defaults = dict(
+            tokenizer='clip', batch_size=4,
+            timesteps_prior=100, timesteps_64=50,
+            timesteps_256=20, timesteps_1024=20,
+            guide_prior=3.0, guide_64=7.0, guide_256=3.0, guide_1024=3.0,
+            eta_prior=0.0, eta_64=0.0, eta_256=0.0, eta_1024=0.0)
+        options = {k: input.get(k, v) for k, v in defaults.items()}
+
+        # ddim sampling
+        pil_imgs = self.model.synthesis(text=input.get('text'), **options)
+
+        # PIL images -> arrays, reversing the channel order
+        return [np.array(u)[..., ::-1] for u in pil_imgs]
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/prior.py b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py
new file mode 100644
index 00000000..380fa467
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py
@@ -0,0 +1,170 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['Prior']
+
+
+def sinusoidal_embedding(timesteps, dim):
+    # map 1-D timesteps [B] to features [B, dim]: cosine half first, then sine half
+    half = dim // 2
+    timesteps = timesteps.float()
+
+    # outer product of the timesteps with the frequencies 10000 ** (-i / half), i = 0..half-1
+    sinusoid = torch.outer(
+        timesteps, torch.pow(10000,
+                             -torch.arange(half).to(timesteps).div(half)))
+    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
+    if dim % 2 != 0:  # odd dim: append one zero column so the width equals dim
+        x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1)
+    return x
+
+
+class SelfAttention(nn.Module):  # multi-head self-attention over [b, l, dim] with an optional mask
+
+    def __init__(self, dim, num_heads):
+        assert dim % num_heads == 0
+        super(SelfAttention, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = math.pow(self.head_dim, -0.25)  # applied to q and k separately: head_dim ** -0.5 overall
+
+        # layers: fused q/k/v projection and the output projection
+        self.to_qkv = nn.Linear(dim, dim * 3)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(self, x, mask):
+        b, l, n, c = *x.shape[:2], self.num_heads, self.head_dim
+
+        # compute query, key, value, each of shape [b, l, n, c]
+        q, k, v = self.to_qkv(x).view(b, l, n * 3, c).chunk(3, dim=2)
+
+        # compute attention logits [b, n, l, l]; entries where mask == 0 are set to -inf
+        attn = torch.einsum('binc,bjnc->bnij', q * self.scale, k * self.scale)
+        if mask is not None:
+            attn = attn.masked_fill(mask[:, :, :l, :l] == 0, float('-inf'))
+        attn = F.softmax(attn.float(), dim=-1).type(attn.dtype)  # softmax in float32, then cast back
+
+        # gather context and merge the heads back to [b, l, n * c]
+        x = torch.einsum('bnij,bjnc->binc', attn, v)
+        x = x.reshape(b, l, -1)
+
+        # output
+        x = self.proj(x)
+        return x
+
+
+class AttentionBlock(nn.Module):  # pre-norm transformer block: self-attention then feed-forward
+
+    def __init__(self, dim, num_heads):
+        super(AttentionBlock, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+
+        # layers: LayerNorm before attention, LayerNorm before a 4x-wide GELU MLP
+        self.norm1 = nn.LayerNorm(dim)
+        self.attn = SelfAttention(dim, num_heads)
+        self.norm2 = nn.LayerNorm(dim)
+        self.ffn = nn.Sequential(
+            nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))
+
+    def forward(self, x, mask=None):
+        x = x + self.attn(self.norm1(x), mask)  # residual around the attention branch
+        x = x + self.ffn(self.norm2(x))  # residual around the feed-forward branch
+        return x
+
+
+class Prior(nn.Module):  # transformer over 4 tokens (text, time, vision, eos) that outputs a clip_dim vector
+
+    def __init__(self, dim=2048, clip_dim=768, num_heads=32, num_layers=24):
+        super(Prior, self).__init__()
+        self.dim = dim
+        self.clip_dim = clip_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+
+        # embeddings: three MLPs into the model width, plus learned eos and position parameters
+        self.text_embedding = nn.Sequential(
+            nn.Linear(clip_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
+        self.time_embedding = nn.Sequential(
+            nn.Linear(dim, dim), nn.SiLU(), nn.Linear(dim, dim))
+        self.vision_embedding = nn.Sequential(
+            nn.Linear(clip_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
+        self.eos_embedding = nn.Parameter(torch.zeros(1, 1, dim))
+        self.pos_embedding = nn.Parameter(torch.zeros(1, 4, dim))  # one position per token of the sequence
+
+        # transformer
+        self.blocks = nn.ModuleList(
+            [AttentionBlock(dim, num_heads) for _ in range(num_layers)])
+        self.norm = nn.LayerNorm(dim)
+
+        # head
+        self.head = nn.Linear(dim, clip_dim)
+
+        # causal (lower-triangular) attention mask over the 4 tokens
+        self.register_buffer('attn_mask', torch.tril(torch.ones(1, 1, 4, 4)))
+
+        # initialize weights
+        self.init_weights()
+
+    def forward(self, x, t, y):
+        r"""x:      [B, C].
+            t:      [B].
+            y:      [B, C].
+        """
+        b = x.size(0)
+
+        # embeddings of shape [B, 4, dim]: text, time, vision and eos tokens, in that order
+        u1 = sinusoidal_embedding(t, self.dim)
+        u2 = [
+            self.text_embedding(y).unsqueeze(1),
+            self.time_embedding(u1).unsqueeze(1),
+            self.vision_embedding(x).unsqueeze(1),
+            self.eos_embedding.repeat(b, 1, 1)
+        ]
+        x = self.pos_embedding + torch.cat(u2, dim=1)
+
+        # transformer
+        for block in self.blocks:
+            x = block(x, self.attn_mask)
+        x = self.norm(x)
+
+        # head: only the last (eos) position is projected to clip_dim
+        x = self.head(x[:, -1])
+        return x
+
+    def init_weights(self):
+        std = 0.02 / math.sqrt(2.0 * self.num_layers)  # std shrinks with depth for the residual output layers
+        for name, m in self.named_modules():  # bare nn.Parameters (eos/pos embeddings) are not visited here
+            if name.endswith('attn.proj') or name.endswith('ffn.2'):
+                # smaller std for output layers
+                nn.init.normal_(m.weight, std=std)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, (nn.Linear, nn.Embedding)):
+                nn.init.normal_(m.weight, std=0.02)
+                if isinstance(m, nn.Linear) and m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+
+    def param_groups(self):
+        groups = [{  # group 0: norm parameters and biases with weight_decay 0.0; group 1: all other parameters
+            'params': [
+                p for n, p in self.named_parameters()
+                if 'norm' in n or n.endswith('bias')
+            ],
+            'weight_decay':
+            0.0
+        }, {
+            'params': [
+                p for n, p in self.named_parameters()
+                if not ('norm' in n or n.endswith('bias'))
+            ]
+        }]
+        return groups
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py
new file mode 100644
index 00000000..6fd9bebe
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py
@@ -0,0 +1,199 @@
+# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP.
+
+import gzip
+import html
+from functools import lru_cache
+
+import ftfy
+import regex as re
+import torch
+from transformers import AutoTokenizer
+
+__all__ = ['CLIPTokenizer', 'XGLMTokenizer']
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord('!'),
+                    ord('~') + 1)) + list(range(
+                        ord('¡'),
+                        ord('¬') + 1)) + list(range(ord('®'),
+                                                    ord('ÿ') + 1))
+    cs = bs[:]  # the byte values listed above map to the character with the same code point
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)  # every other byte gets the next unused code point from 256 upward
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))  # the actual return value is a dict: byte value -> one-character string
+
+
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]  # NOTE(review): raises IndexError when word is empty
+    for char in word[1:]:
+        pairs.add((prev_char, char))  # adjacent symbols (left, right)
+        prev_char = char
+    return pairs
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)  # let ftfy repair the text first
+    text = html.unescape(html.unescape(text))  # unescape twice so doubly-escaped entities are resolved too
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)  # collapse every whitespace run into a single space
+    text = text.strip()  # drop leading and trailing whitespace
+    return text
+
+
+class SimpleTokenizer(object):  # byte-level BPE tokenizer built from a gzip-compressed merges file
+
+    def __init__(self, bpe_path):
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with gzip.open(bpe_path) as f:  # read merges, skipping the first line; the handle is closed on exit
+            merges = f.read().decode('utf-8').split('\n')[1:49152 - 256 - 2 + 1]
+        merges = [tuple(merge.split()) for merge in merges]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v + '</w>' for v in vocab]  # '</w>' marks a symbol that ends a word
+        for merge in merges:
+            vocab.append(''.join(merge))
+        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
+        self.encoder = dict(zip(vocab, range(len(vocab))))  # symbol -> id
+        self.decoder = {v: k for k, v in self.encoder.items()}  # id -> symbol
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))  # lower rank = merged earlier
+        self.cache = {
+            '<|startoftext|>': '<|startoftext|>',
+            '<|endoftext|>': '<|endoftext|>'
+        }
+        self.pat = re.compile(
+            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+            re.IGNORECASE)
+
+    def bpe(self, token):
+        if token in self.cache:  # memoised result (the special tokens are pre-seeded)
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + '</w>', )
+        pairs = get_pairs(word)
+
+        if not pairs:  # single-symbol word: nothing to merge
+            return token + '</w>'
+
+        while True:
+            bigram = min(  # best candidate = adjacent pair with the lowest merge rank
+                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:  # tuple.index: `first` does not occur again, copy the rest
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[
+                        i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)  # merged symbols are returned space-separated
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b]  # re-spell the token's utf-8 bytes as vocabulary characters
+                            for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token]
+                              for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode(
+            'utf-8', errors='replace').replace('</w>', ' ')  # undecodable bytes are replaced; '</w>' becomes a space
+        return text
+
+
+class CLIPTokenizer(object):
+    r"""CLIP tokenizer, adapted from https://github.com/openai/CLIP.
+    """
+
+    def __init__(self, bpe_path, length=77):
+        self.bpe_path = bpe_path
+        self.length = length  # fixed length of every token sequence produced by _tokenizer
+
+        # init tokenizer
+        self.tokenizer = SimpleTokenizer(bpe_path=bpe_path)
+        self.sos_token = self.tokenizer.encoder['<|startoftext|>']
+        self.eos_token = self.tokenizer.encoder['<|endoftext|>']
+        self.vocab_size = len(self.tokenizer.encoder)
+
+    def __call__(self, sequence):
+        if isinstance(sequence, str):  # one string -> 1-D tensor of self.length ids
+            return torch.LongTensor(self._tokenizer(sequence))
+        elif isinstance(sequence, list):  # list of strings -> one row of ids per string
+            return torch.LongTensor([self._tokenizer(u) for u in sequence])
+        else:
+            raise TypeError(
+                f'Expected the "sequence" to be a string or a list, but got {type(sequence)}'
+            )
+
+    def _tokenizer(self, text):
+        tokens = self.tokenizer.encode(text)[:self.length - 2]  # leave room for the sos and eos tokens
+        tokens = [self.sos_token] + tokens + [self.eos_token]
+        tokens = tokens + [0] * (self.length - len(tokens))  # right-pad with id 0 up to self.length
+        return tokens
+
+
+class XGLMTokenizer(object):
+    r"""A wrapper of HuggingFace's XGLM tokenizer.
+    """
+
+    def __init__(self, model_dir, length=77, **kwargs):
+        self.length = length  # used as max_length for padding and truncation in __call__
+        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, **kwargs)
+        self.vocab_size = self.tokenizer.vocab_size
+
+    def __call__(self, sequence, **kwargs):
+        _kwargs = {  # defaults: PyTorch tensors, padded and truncated to self.length
+            'return_tensors': 'pt',
+            'padding': 'max_length',
+            'truncation': True,
+            'max_length': self.length
+        }
+        _kwargs.update(**kwargs)  # caller-supplied options override the defaults above
+        tokens = self.tokenizer(sequence, **_kwargs)
+        return tokens.input_ids, tokens.attention_mask  # tuple: (input ids, attention mask)
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py
new file mode 100644
index 00000000..4e99a514
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py
@@ -0,0 +1,466 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['Upsampler256', 'Upsampler1024']
+
+
+def sinusoidal_embedding(timesteps, dim):
+    # map 1-D timesteps [B] to features [B, dim]: cosine half first, then sine half
+    half = dim // 2
+    timesteps = timesteps.float()
+
+    # outer product of the timesteps with the frequencies 10000 ** (-i / half), i = 0..half-1
+    sinusoid = torch.outer(
+        timesteps, torch.pow(10000,
+                             -torch.arange(half).to(timesteps).div(half)))
+    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
+    if dim % 2 != 0:  # odd dim: append one zero column so the width equals dim
+        x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1)
+    return x
+
+
+class Resample(nn.Module):  # 2x upsampling, 2x downsampling or identity, selected by scale_factor
+
+    def __init__(self, in_dim, out_dim, scale_factor, use_conv=False):
+        assert scale_factor in [0.5, 1.0, 2.0]
+        super(Resample, self).__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.scale_factor = scale_factor
+        self.use_conv = use_conv
+
+        # layers; in_dim/out_dim only affect the result when use_conv is True
+        if scale_factor == 2.0:  # nearest-neighbour upsampling, optionally followed by a 3x3 conv
+            self.resample = nn.Sequential(
+                nn.Upsample(scale_factor=scale_factor, mode='nearest'),
+                nn.Conv2d(in_dim, out_dim, 3, padding=1)
+                if use_conv else nn.Identity())
+        elif scale_factor == 0.5:  # stride-2 3x3 conv, or 2x2 average pooling without conv
+            self.resample = nn.Conv2d(
+                in_dim, out_dim, 3, stride=2,
+                padding=1) if use_conv else nn.AvgPool2d(
+                    kernel_size=2, stride=2)
+        else:  # scale_factor == 1.0: pass-through
+            self.resample = nn.Identity()
+
+    def forward(self, x):
+        return self.resample(x)
+
+
+class ResidualBlock(nn.Module):  # residual conv block conditioned on an embedding e, with optional resampling
+
+    def __init__(self,
+                 in_dim,
+                 embed_dim,
+                 out_dim,
+                 use_scale_shift_norm=True,
+                 scale_factor=1.0,
+                 dropout=0.0):
+        super(ResidualBlock, self).__init__()
+        self.in_dim = in_dim
+        self.embed_dim = embed_dim
+        self.out_dim = out_dim
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.scale_factor = scale_factor
+
+        # layers: norm/act/conv, a conv-free resampler, the embedding projection, norm/act/dropout/conv, shortcut
+        self.layer1 = nn.Sequential(
+            nn.GroupNorm(32, in_dim), nn.SiLU(),
+            nn.Conv2d(in_dim, out_dim, 3, padding=1))
+        self.resample = Resample(in_dim, in_dim, scale_factor, use_conv=False)
+        self.embedding = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(embed_dim,
+                      out_dim * 2 if use_scale_shift_norm else out_dim))  # 2x width holds scale and shift
+        self.layer2 = nn.Sequential(
+            nn.GroupNorm(32, out_dim), nn.SiLU(), nn.Dropout(dropout),
+            nn.Conv2d(out_dim, out_dim, 3, padding=1))
+        self.shortcut = nn.Identity() if in_dim == out_dim else nn.Conv2d(
+            in_dim, out_dim, 1)  # 1x1 conv only when the channel count changes
+
+        # zero out the weight of the last conv (its bias is left untouched here)
+        nn.init.zeros_(self.layer2[-1].weight)
+
+    def forward(self, x, e):
+        identity = self.resample(x)  # resampled input feeds the shortcut path
+        x = self.layer1[-1](self.resample(self.layer1[:-1](x)))  # norm + act, resample, then conv
+        e = self.embedding(e).unsqueeze(-1).unsqueeze(-1).type(x.dtype)  # two trailing axes added to broadcast
+        if self.use_scale_shift_norm:
+            scale, shift = e.chunk(2, dim=1)
+            x = self.layer2[0](x) * (1 + scale) + shift  # modulate the normalised features
+            x = self.layer2[1:](x)
+        else:
+            x = x + e  # additive conditioning
+            x = self.layer2(x)
+        x = x + self.shortcut(identity)
+        return x
+
+
+class AttentionBlock(nn.Module):  # spatial self-attention with optional extra context tokens
+
+    def __init__(self, dim, context_dim=None, num_heads=None, head_dim=None):
+        # consider head_dim first, then num_heads
+        num_heads = dim // head_dim if head_dim else num_heads
+        head_dim = dim // num_heads  # NOTE(review): TypeError when both num_heads and head_dim are None
+        assert num_heads * head_dim == dim
+        super(AttentionBlock, self).__init__()
+        self.dim = dim
+        self.context_dim = context_dim
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.scale = math.pow(head_dim, -0.25)  # applied to q and k separately: head_dim ** -0.5 overall
+
+        # layers; context_kv only exists when context_dim is given
+        self.norm = nn.GroupNorm(32, dim)
+        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
+        if context_dim is not None:
+            self.context_kv = nn.Linear(context_dim, dim * 2)
+        self.proj = nn.Conv2d(dim, dim, 1)
+
+        # zero out the weight of the output projection (its bias is left untouched here)
+        nn.init.zeros_(self.proj.weight)
+
+    def forward(self, x, context=None):
+        r"""x:       [B, C, H, W].
+            context: [B, L, C] or None.
+        """
+        identity = x
+        b, c, h, w, n, d = *x.size(), self.num_heads, self.head_dim
+
+        # compute query, key, value, each of shape [b, n, d, h * w]
+        x = self.norm(x)
+        q, k, v = self.to_qkv(x).view(b, n * 3, d, h * w).chunk(3, dim=1)
+        if context is not None:  # ck and cv are [b, n, d, L]
+            ck, cv = self.context_kv(context).reshape(b, -1, n * 2,
+                                                      d).permute(0, 2, 3,
+                                                                 1).chunk(
+                                                                     2, dim=1)
+            k = torch.cat([ck, k], dim=-1)  # context keys are placed before the spatial keys
+            v = torch.cat([cv, v], dim=-1)
+
+        # compute attention of shape [b, n, h * w, number of keys]
+        attn = torch.matmul(q.transpose(-1, -2) * self.scale, k * self.scale)
+        attn = F.softmax(attn, dim=-1)
+
+        # gather context and restore the [b, c, h, w] layout
+        x = torch.matmul(v, attn.transpose(-1, -2))
+        x = x.reshape(b, c, h, w)
+
+        # output
+        x = self.proj(x)
+        return x + identity
+
+
+class Upsampler256(nn.Module):  # encoder/middle/decoder network with skip connections and attention
+
+    def __init__(self,
+                 in_dim=6,
+                 dim=320,
+                 y_dim=768,
+                 context_dim=512,
+                 out_dim=3,
+                 dim_mult=[1, 2, 3, 4],
+                 num_heads=None,
+                 head_dim=64,
+                 num_res_blocks=3,
+                 attn_scales=[1 / 8],
+                 resblock_resample=True,
+                 use_scale_shift_norm=True,
+                 dropout=0.1):
+        embed_dim = dim * 4
+        super(Upsampler256, self).__init__()
+        self.in_dim = in_dim
+        self.dim = dim
+        self.y_dim = y_dim
+        self.context_dim = context_dim
+        self.embed_dim = embed_dim
+        self.out_dim = out_dim
+        self.dim_mult = dim_mult
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.resblock_resample = resblock_resample
+        self.use_scale_shift_norm = use_scale_shift_norm
+
+        # params: per-stage channel widths; shortcut_dims records skip widths, scale the relative resolution
+        enc_dims = [dim * u for u in [1] + dim_mult]
+        dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        shortcut_dims = []
+        scale = 1.0
+
+        # embeddings: timestep and y are summed into e in forward(); y also yields 4 context tokens
+        self.time_embedding = nn.Sequential(
+            nn.Linear(dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        self.y_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        self.context_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, context_dim * 4))
+
+        # encoder
+        self.encoder = nn.ModuleList(
+            [nn.Conv2d(self.in_dim, dim, 3, padding=1)])
+        shortcut_dims.append(dim)
+        for i, (in_dim,  # NOTE: the loop rebinds the in_dim/out_dim arguments; use self.in_dim/self.out_dim
+                out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])):
+            for j in range(num_res_blocks):
+                # residual (+attention) blocks
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim, embed_dim, out_dim,
+                                  use_scale_shift_norm, 1.0, dropout)
+                ])
+                if scale in attn_scales:
+                    block.append(
+                        AttentionBlock(out_dim, context_dim, num_heads,
+                                       head_dim))
+                in_dim = out_dim
+                self.encoder.append(block)
+                shortcut_dims.append(out_dim)
+
+                # downsample after the last block of every stage except the final one
+                if i != len(dim_mult) - 1 and j == num_res_blocks - 1:
+                    if resblock_resample:
+                        downsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                   use_scale_shift_norm, 0.5,
+                                                   dropout)
+                    else:
+                        downsample = Resample(
+                            out_dim, out_dim, 0.5, use_conv=True)
+                    shortcut_dims.append(out_dim)
+                    scale /= 2.0
+                    self.encoder.append(downsample)
+
+        # middle (out_dim is the width left over from the last encoder stage)
+        self.middle = nn.ModuleList([
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout),
+            AttentionBlock(out_dim, context_dim, num_heads, head_dim),
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout)
+        ])
+
+        # decoder
+        self.decoder = nn.ModuleList()
+        for i, (in_dim,
+                out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])):
+            for j in range(num_res_blocks + 1):
+                # residual (+attention) blocks
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim,  # widened by the skip concat
+                                  out_dim, use_scale_shift_norm, 1.0, dropout)
+                ])
+                if scale in attn_scales:
+                    block.append(
+                        AttentionBlock(out_dim, context_dim, num_heads,
+                                       head_dim))
+                in_dim = out_dim
+
+                # upsample after the last block of every stage except the final one
+                if i != len(dim_mult) - 1 and j == num_res_blocks:
+                    if resblock_resample:
+                        upsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                 use_scale_shift_norm, 2.0,
+                                                 dropout)
+                    else:
+                        upsample = Resample(
+                            out_dim, out_dim, 2.0, use_conv=True)
+                    scale *= 2.0
+                    block.append(upsample)
+                self.decoder.append(block)
+
+        # head
+        self.head = nn.Sequential(
+            nn.GroupNorm(32, out_dim), nn.SiLU(),
+            nn.Conv2d(out_dim, self.out_dim, 3, padding=1))
+
+        # zero out the weight of the final conv
+        nn.init.zeros_(self.head[-1].weight)
+
+    def forward(self, x, t, y, concat):
+        # embeddings
+        x = torch.cat([x, concat], dim=1)  # concat is stacked onto x along dim 1
+        e = self.time_embedding(sinusoidal_embedding(
+            t, self.dim)) + self.y_embedding(y)
+        context = self.context_embedding(y).view(-1, 4, self.context_dim)  # 4 context tokens per sample
+
+        # encoder
+        xs = []
+        for block in self.encoder:
+            x = self._forward_single(block, x, e, context)
+            xs.append(x)  # keep every encoder output for the skip connections
+
+        # middle
+        for block in self.middle:
+            x = self._forward_single(block, x, e, context)
+
+        # decoder
+        for block in self.decoder:
+            x = torch.cat([x, xs.pop()], dim=1)  # skip connections are consumed in reverse order
+            x = self._forward_single(block, x, e, context)
+
+        # head
+        x = self.head(x)
+        return x
+
+    def _forward_single(self, module, x, e, context):
+        if isinstance(module, ResidualBlock):  # residual blocks take the embedding e
+            x = module(x, e)
+        elif isinstance(module, AttentionBlock):  # attention blocks take the context tokens
+            x = module(x, context)
+        elif isinstance(module, nn.ModuleList):  # apply the members one after another
+            for block in module:
+                x = self._forward_single(block, x, e, context)
+        else:  # any other module is called on x alone
+            x = module(x)
+        return x
+
+
+class Upsampler1024(nn.Module):  # encoder/middle/decoder network with skip connections, no attention
+
+    def __init__(self,
+                 in_dim=6,
+                 dim=192,
+                 y_dim=768,
+                 out_dim=3,
+                 dim_mult=[1, 1, 2, 2, 4, 4],
+                 num_res_blocks=2,
+                 resblock_resample=True,
+                 use_scale_shift_norm=True,
+                 dropout=0.0):
+        embed_dim = dim * 4
+        super(Upsampler1024, self).__init__()
+        self.in_dim = in_dim
+        self.dim = dim
+        self.y_dim = y_dim
+        self.out_dim = out_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.resblock_resample = resblock_resample
+        self.use_scale_shift_norm = use_scale_shift_norm
+
+        # params: per-stage channel widths; shortcut_dims records skip widths, scale the relative resolution
+        enc_dims = [dim * u for u in [1] + dim_mult]
+        dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        shortcut_dims = []
+        scale = 1.0
+
+        # embedding: timestep and y are summed into e in forward()
+        self.time_embedding = nn.Sequential(
+            nn.Linear(dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        self.y_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+
+        # encoder
+        self.encoder = nn.ModuleList(
+            [nn.Conv2d(self.in_dim, dim, 3, padding=1)])
+        shortcut_dims.append(dim)
+        for i, (in_dim,  # NOTE: the loop rebinds the in_dim/out_dim arguments; use self.in_dim/self.out_dim
+                out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])):
+            for j in range(num_res_blocks):
+                # residual block
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim, embed_dim, out_dim,
+                                  use_scale_shift_norm, 1.0, dropout)
+                ])
+                shortcut_dims.append(out_dim)
+                in_dim = out_dim
+                self.encoder.append(block)
+
+                # downsample after the last block of every stage except the final one
+                if i != len(dim_mult) - 1 and j == num_res_blocks - 1:
+                    if resblock_resample:
+                        downsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                   use_scale_shift_norm, 0.5,
+                                                   dropout)
+                    else:
+                        downsample = Resample(
+                            out_dim, out_dim, 0.5, use_conv=True)
+                    shortcut_dims.append(out_dim)
+                    scale /= 2.0
+                    self.encoder.append(downsample)
+
+        # middle (out_dim is the width left over from the last encoder stage)
+        self.middle = nn.ModuleList([
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout),
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout)
+        ])
+
+        # decoder
+        self.decoder = nn.ModuleList()
+        for i, (in_dim,
+                out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])):
+            for j in range(num_res_blocks + 1):
+                # residual block
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim,  # widened by the skip concat
+                                  out_dim, use_scale_shift_norm, 1.0, dropout)
+                ])
+                in_dim = out_dim
+
+                # upsample after the last block of every stage except the final one
+                if i != len(dim_mult) - 1 and j == num_res_blocks:
+                    if resblock_resample:
+                        upsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                 use_scale_shift_norm, 2.0,
+                                                 dropout)
+                    else:
+                        upsample = Resample(
+                            out_dim, out_dim, 2.0, use_conv=True)
+                    scale *= 2.0
+                    block.append(upsample)
+                self.decoder.append(block)
+
+        # head
+        self.head = nn.Sequential(
+            nn.GroupNorm(32, out_dim), nn.SiLU(),
+            nn.Conv2d(out_dim, self.out_dim, 3, padding=1))
+
+        # zero out the weight of the final conv
+        nn.init.zeros_(self.head[-1].weight)
+
+    def forward(self, x, t, y, concat):
+        # embedding
+        x = torch.cat([x, concat], dim=1)  # concat is stacked onto x along dim 1
+        e = self.time_embedding(sinusoidal_embedding(
+            t, self.dim)) + self.y_embedding(y)
+
+        # encoder
+        xs = []
+        for block in self.encoder:
+            x = self._forward_single(block, x, e)
+            xs.append(x)  # keep every encoder output for the skip connections
+
+        # middle
+        for block in self.middle:
+            x = self._forward_single(block, x, e)
+
+        # decoder
+        for block in self.decoder:
+            x = torch.cat([x, xs.pop()], dim=1)  # skip connections are consumed in reverse order
+            x = self._forward_single(block, x, e)
+
+        # head
+        x = self.head(x)
+        return x
+
+    def _forward_single(self, module, x, e):
+        if isinstance(module, ResidualBlock):  # residual blocks take the embedding e
+            x = module(x, e)
+        elif isinstance(module, nn.ModuleList):  # apply the members one after another
+            for block in module:
+                x = self._forward_single(block, x, e)
+        else:  # any other module is called on x alone
+            x = module(x)
+        return x
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py
new file mode 100644
index 00000000..8a0b3ff1
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py
@@ -0,0 +1,205 @@
+# The implementation here is modified based on HuggingFace XGLM, publicly available
+# at https://github.com/huggingface/transformers.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['XGLM']
+
+
+def sinusoidal_embedding(seq_len, dim, pad_token=None):
+    half = dim // 2
+    sinusoid = torch.outer(
+        torch.arange(seq_len, dtype=torch.float32),
+        torch.pow(10000,
+                  -torch.arange(half, dtype=torch.float32).div(half - 1)))
+    x = torch.cat([torch.sin(sinusoid), torch.cos(sinusoid)], dim=1)
+    if dim % 2 == 1:
+        x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1)
+    if pad_token is not None:
+        x[pad_token, :] = 0
+    return x
+
+
+class SinusoidalEmbedding(nn.Module):
+
+    def __init__(self, seq_len, dim, pad_token):
+        super(SinusoidalEmbedding, self).__init__()
+        self.seq_len = seq_len
+        self.dim = dim
+        self.pad_token = pad_token
+        self.register_buffer('weight',
+                             sinusoidal_embedding(seq_len + 2, dim, pad_token))
+
+    def forward(self, tokens):
+        mask = tokens.ne(self.pad_token).long()
+        indices = torch.cumsum(mask, dim=1) * mask + self.pad_token
+        pos_embeds = self.weight.index_select(0, indices.view(-1)).view(
+            *tokens.shape, -1)
+        return pos_embeds
+
+
+class GELU(nn.Module):
+
+    def forward(self, x):
+        return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+class SelfAttention(nn.Module):
+
+    def __init__(self, dim, num_heads, dropout=0.1):
+        assert dim % num_heads == 0
+        super(SelfAttention, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = 1.0 / math.sqrt(self.head_dim)
+
+        # layers
+        self.q = nn.Linear(dim, dim)
+        self.k = nn.Linear(dim, dim)
+        self.v = nn.Linear(dim, dim)
+        self.o = nn.Linear(dim, dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, mask=None):
+        r"""x:      [B, L, C].
+            mask:   [B, *, L, L] or None.
+        """
+        b, l, n, c = *x.shape[:2], self.num_heads, self.head_dim
+
+        # compute query, key, value
+        q = self.q(x).view(b, l, n, c)
+        k = self.k(x).view(b, l, n, c)
+        v = self.v(x).view(b, l, n, c)
+
+        # compute attention
+        attn = self.scale * torch.einsum('binc,bjnc->bnij', q, k)
+        if mask is not None:
+            attn = attn.masked_fill(mask == 0, float('-inf'))
+        attn = F.softmax(attn, dim=-1)
+        attn = self.dropout(attn)
+
+        # gather context
+        x = torch.einsum('bnij,bjnc->binc', attn, v)
+        x = x.reshape(b, l, -1)
+
+        # output
+        x = self.o(x)
+        x = self.dropout(x)
+        return x
+
+
+class AttentionBlock(nn.Module):
+
+    def __init__(self, dim, ffn_dim, ffn_act, num_heads, dropout=0.1):
+        assert ffn_act in ['gelu', 'relu']
+        super(AttentionBlock, self).__init__()
+        self.dim = dim
+        self.ffn_dim = ffn_dim
+        self.ffn_act = ffn_act
+        self.num_heads = num_heads
+
+        # layers
+        self.norm1 = nn.LayerNorm(dim)
+        self.attn = SelfAttention(dim, num_heads, dropout)
+        self.norm2 = nn.LayerNorm(dim)
+        self.ffn = nn.Sequential(
+            nn.Linear(dim, ffn_dim),
+            GELU() if ffn_act == 'gelu' else nn.ReLU(inplace=True),
+            nn.Linear(ffn_dim, dim), nn.Dropout(dropout))
+
+    def forward(self, x, mask=None):
+        x = x + self.attn(self.norm1(x), mask)
+        x = x + self.ffn(self.norm2(x))
+        return x
+
+
+class XGLM(nn.Module):
+    r"""A multilingual GPT model with an embedding head.
+    """
+
+    def __init__(self,
+                 vocab_size=256008,
+                 max_seq_len=2048,
+                 dim=1024,
+                 ffn_dim=4096,
+                 ffn_act='gelu',
+                 embed_dim=768,
+                 num_heads=16,
+                 num_layers=24,
+                 pad_token=1,
+                 dropout=0.1):
+        super(XGLM, self).__init__()
+        self.vocab_size = vocab_size
+        self.max_seq_len = max_seq_len
+        self.dim = dim
+        self.ffn_dim = ffn_dim
+        self.ffn_act = ffn_act
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.pad_token = pad_token
+        self.scale = math.sqrt(dim)  # rescale token embeddings
+
+        # layers
+        self.token_embedding = nn.Embedding(vocab_size, dim, pad_token)
+        self.pos_embedding = SinusoidalEmbedding(max_seq_len, dim, pad_token)
+        self.eos_embedding = nn.Parameter(torch.randn(1, 1, dim))
+        self.dropout = nn.Dropout(dropout)
+        self.blocks = nn.ModuleList([
+            AttentionBlock(dim, ffn_dim, ffn_act, num_heads, dropout)
+            for _ in range(num_layers)
+        ])
+        self.norm = nn.LayerNorm(dim)
+        self.head = nn.Linear(dim, embed_dim, bias=False)
+
+        # causal attention mask
+        self.register_buffer(
+            'attn_mask',
+            torch.tril(torch.ones(1, 1, 1 + max_seq_len, 1 + max_seq_len)))
+
+        # init weights
+        self.apply(self.init_weights)
+
+    def forward(self, tokens, mask=None):
+        r"""tokens: [B, L].
+            mask:   [B, L].
+        """
+        b, seq_len = tokens.size(0), 1 + tokens.size(1)
+
+        # embeddings
+        x = self.scale * self.token_embedding(tokens)
+        x = torch.cat([x, self.eos_embedding.repeat(b, 1, 1)], dim=1)
+        # x = x + self.pos_embedding(tokens)
+        x = self.dropout(x)
+
+        # attention mask
+        if mask is None:
+            mask = self.attn_mask[:, :, :seq_len, :seq_len].repeat(b, 1, 1, 1)
+        else:
+            mask = self.attn_mask[:, :, :seq_len, :seq_len] * torch.cat(
+                [mask, torch.zeros_like(mask[:, :1])], dim=1).view(
+                    b, 1, 1, seq_len)
+
+        # transformer
+        for block in self.blocks:
+            x = block(x, mask)
+        x = self.norm(x)
+
+        # head
+        logits = self.head(x[:, -1])
+        return logits
+
+    def init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            nn.init.normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+        elif isinstance(m, nn.Embedding):
+            nn.init.normal_(m.weight, std=0.02)
+            if m.padding_idx is not None:
+                nn.init.zeros_(m.weight[m.padding_idx])
diff --git a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
index 406538cf..f402cc29 100644
--- a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
+++ b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
@@ -3,7 +3,8 @@ from typing import Any, Dict, Optional
 import torch
 
 from modelscope.metainfo import Pipelines
-from modelscope.models.multi_modal import OfaForTextToImageSynthesis
+from modelscope.models.multi_modal import (
+    MultiStageDiffusionForTextToImageSynthesis, OfaForTextToImageSynthesis)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -48,7 +49,9 @@ class TextToImageSynthesisPipeline(Pipeline):
             return input
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        if isinstance(self.model, OfaForTextToImageSynthesis):
+        if isinstance(self.model,
+                      (OfaForTextToImageSynthesis,
+                       MultiStageDiffusionForTextToImageSynthesis)):
             return self.model(input)
         return self.model.generate(input)
 
diff --git a/tests/pipelines/test_multi_stage_diffusion.py b/tests/pipelines/test_multi_stage_diffusion.py
new file mode 100644
index 00000000..f4e63ce0
--- /dev/null
+++ b/tests/pipelines/test_multi_stage_diffusion.py
@@ -0,0 +1,40 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+import numpy as np
+import torch
+
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class MultiStageDiffusionTest(unittest.TestCase):
+    model_id = 'damo/cv_diffusion_text-to-image-synthesis'
+    test_text = {'text': 'Photograph of a baby chicken wearing sunglasses'}
+
+    @unittest.skip(
+        'skip test since the pretrained model is not publicly available')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        pipe_line_text_to_image_synthesis = pipeline(
+            task=Tasks.text_to_image_synthesis, model=model)
+        img = pipe_line_text_to_image_synthesis(
+            self.test_text)[OutputKeys.OUTPUT_IMG]
+        print(np.sum(np.abs(img)))
+
+    @unittest.skip(
+        'skip test since the pretrained model is not publicly available')
+    def test_run_with_model_name(self):
+        pipe_line_text_to_image_synthesis = pipeline(
+            task=Tasks.text_to_image_synthesis, model=self.model_id)
+        img = pipe_line_text_to_image_synthesis(
+            self.test_text)[OutputKeys.OUTPUT_IMG]
+        print(np.sum(np.abs(img)))
+
+
+if __name__ == '__main__':
+    unittest.main()

From fabb4716d4d033f7664ff7b38d9f815d3676f5c6 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Tue, 6 Sep 2022 21:47:59 +0800
Subject: [PATCH 070/175] [to #44610931] fix: add device usage when device is
 None or empty

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10039848

    * add device usage when device is None or empty

    * update docker env
---
 .dev_scripts/dockerci.sh   | 1 +
 modelscope/utils/device.py | 2 +-
 tests/utils/test_device.py | 6 ++++++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh
index e76f2f14..af94b211 100644
--- a/.dev_scripts/dockerci.sh
+++ b/.dev_scripts/dockerci.sh
@@ -36,6 +36,7 @@ do
              -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
              -e TEST_LEVEL=$TEST_LEVEL \
              -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
+             -e MODEL_TAG_URL=$MODEL_TAG_URL \
              --workdir=$CODE_DIR_IN_CONTAINER \
              --net host  \
              ${IMAGE_NAME}:${IMAGE_VERSION} \
diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py
index 40804970..33c0910d 100644
--- a/modelscope/utils/device.py
+++ b/modelscope/utils/device.py
@@ -19,9 +19,9 @@ def verify_device(device_name):
     Return:
         device info (tuple):  device_type and device_id, if device_id is not set, will use 0 as default.
     """
-    device_name = device_name.lower()
     err_msg = 'device should be either cpu, cuda, gpu, gpu:X or cuda:X where X is the ordinal for gpu device.'
     assert device_name is not None and device_name != '', err_msg
+    device_name = device_name.lower()
     eles = device_name.split(':')
     assert len(eles) <= 2, err_msg
     assert device_name is not None
diff --git a/tests/utils/test_device.py b/tests/utils/test_device.py
index 4def9915..0d334fda 100644
--- a/tests/utils/test_device.py
+++ b/tests/utils/test_device.py
@@ -50,6 +50,12 @@ class DeviceTest(unittest.TestCase):
         with self.assertRaises(AssertionError):
             verify_device('xgu')
 
+        with self.assertRaises(AssertionError):
+            verify_device('')
+
+        with self.assertRaises(AssertionError):
+            verify_device(None)
+
     def test_create_device_torch(self):
         if torch.cuda.is_available():
             target_device_type = 'cuda'

From d38076b07211960942318b314b98f5203016036a Mon Sep 17 00:00:00 2001
From: "eniac.xcw" <eniac.xcw@alibaba-inc.com>
Date: Tue, 6 Sep 2022 22:18:51 +0800
Subject: [PATCH 071/175] makes test code clear         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10041992

---
 data/test/images/multimodal_similarity.jpg    |  3 +++
 .../pipelines/test_multi_modal_similarity.py  | 26 ++++++++++++-------
 2 files changed, 19 insertions(+), 10 deletions(-)
 create mode 100644 data/test/images/multimodal_similarity.jpg

diff --git a/data/test/images/multimodal_similarity.jpg b/data/test/images/multimodal_similarity.jpg
new file mode 100644
index 00000000..70a2b844
--- /dev/null
+++ b/data/test/images/multimodal_similarity.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f24abbba43782d733dedbb0b4f416635af50263862e5632963ac9263e430555
+size 88542
diff --git a/tests/pipelines/test_multi_modal_similarity.py b/tests/pipelines/test_multi_modal_similarity.py
index d1d6a7a8..192602b4 100644
--- a/tests/pipelines/test_multi_modal_similarity.py
+++ b/tests/pipelines/test_multi_modal_similarity.py
@@ -10,32 +10,38 @@ from modelscope.utils.test_utils import test_level
 
 class MultiModalSimilarityTest(unittest.TestCase):
     model_id = 'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity'
-    test_input = {
-        'img': 'data/test/images/generative_multimodal.jpg',
-        'text': '起居室照片'
-    }
+    test_img = 'data/test/images/multimodal_similarity.jpg'
+    test_str1 = '一个上了年纪的女人在城镇中骑着自行车一个黄色出租车正要从她身边驶过'
+    test_str2 = '穿着蓝色连衣裙的那个女人正冲着行来的车辆伸出她的手'
+
+    def infer_pipeline(self, multi_modal_similarity_pipeline):
+        test_input1 = {'img': self.test_img, 'text': self.test_str1}
+        test_input2 = {'img': self.test_img, 'text': self.test_str2}
+        output1 = multi_modal_similarity_pipeline(test_input1)
+        output2 = multi_modal_similarity_pipeline(test_input2)
+        print('image: {}, text: {}, similarity: {}'.format(
+            self.test_img, self.test_str1, output1['scores']))
+        print('image: {}, text: {}, similarity: {}'.format(
+            self.test_img, self.test_str2, output2['scores']))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
         multi_modal_similarity_pipeline = pipeline(
             Tasks.multi_modal_similarity, model=self.model_id)
-        output = multi_modal_similarity_pipeline(self.test_input)
-        print(output)
+        self.infer_pipeline(multi_modal_similarity_pipeline)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         multi_modal_similarity_pipeline = pipeline(
             task=Tasks.multi_modal_similarity)
-        output = multi_modal_similarity_pipeline(self.test_input)
-        print(output)
+        self.infer_pipeline(multi_modal_similarity_pipeline)
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         multi_modal_similarity_pipeline = pipeline(
             task=Tasks.multi_modal_similarity, model=model)
-        output = multi_modal_similarity_pipeline(self.test_input)
-        print(output)
+        self.infer_pipeline(multi_modal_similarity_pipeline)
 
 
 if __name__ == '__main__':

From c12957a9eb0753b61285cfa44f0d34d72b3e52ba Mon Sep 17 00:00:00 2001
From: ly261666 <ly261666@alibaba-inc.com>
Date: Tue, 6 Sep 2022 22:53:55 +0800
Subject: [PATCH 072/175] =?UTF-8?q?[to=20#42322933]=20=E6=96=B0=E5=A2=9EMt?=
 =?UTF-8?q?cnn=E4=BA=BA=E8=84=B8=E6=A3=80=E6=B5=8B=E5=99=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 完成Maas-cv CR标准 自查
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9951519

    * [to #42322933] 新增Mtcnn人脸检测器
---
 data/test/images/mtcnn_face_detection.jpg     |   3 +
 modelscope/metainfo.py                        |   2 +
 .../models/cv/face_detection/__init__.py      |   5 +-
 .../cv/face_detection/mtcnn/__init__.py       |   1 +
 .../face_detection/mtcnn/models/__init__.py   |   0
 .../face_detection/mtcnn/models/box_utils.py  | 240 ++++++++++++++++++
 .../face_detection/mtcnn/models/detector.py   | 149 +++++++++++
 .../mtcnn/models/first_stage.py               | 100 ++++++++
 .../face_detection/mtcnn/models/get_nets.py   | 160 ++++++++++++
 modelscope/pipelines/cv/__init__.py           |   4 +-
 .../cv/mtcnn_face_detection_pipeline.py       |  56 ++++
 tests/pipelines/test_mtcnn_face_detection.py  |  38 +++
 12 files changed, 756 insertions(+), 2 deletions(-)
 create mode 100644 data/test/images/mtcnn_face_detection.jpg
 create mode 100644 modelscope/models/cv/face_detection/mtcnn/__init__.py
 create mode 100644 modelscope/models/cv/face_detection/mtcnn/models/__init__.py
 create mode 100644 modelscope/models/cv/face_detection/mtcnn/models/box_utils.py
 create mode 100644 modelscope/models/cv/face_detection/mtcnn/models/detector.py
 create mode 100644 modelscope/models/cv/face_detection/mtcnn/models/first_stage.py
 create mode 100644 modelscope/models/cv/face_detection/mtcnn/models/get_nets.py
 create mode 100644 modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py
 create mode 100644 tests/pipelines/test_mtcnn_face_detection.py

diff --git a/data/test/images/mtcnn_face_detection.jpg b/data/test/images/mtcnn_face_detection.jpg
new file mode 100644
index 00000000..c95881fe
--- /dev/null
+++ b/data/test/images/mtcnn_face_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
+size 87228
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index d7217d57..d7594794 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -35,6 +35,7 @@ class Models(object):
     fer = 'fer'
     retinaface = 'retinaface'
     shop_segmentation = 'shop-segmentation'
+    mtcnn = 'mtcnn'
     ulfd = 'ulfd'
 
     # EasyCV models
@@ -127,6 +128,7 @@ class Pipelines(object):
     ulfd_face_detection = 'manual-face-detection-ulfd'
     facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
     retina_face_detection = 'resnet50-face-detection-retinaface'
+    mtcnn_face_detection = 'manual-face-detection-mtcnn'
     live_category = 'live-category'
     general_image_classification = 'vit-base_image-classification_ImageNet-labels'
     daily_image_classification = 'vit-base_image-classification_Dailylife-labels'
diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py
index 63ff1b83..ed8832c2 100644
--- a/modelscope/models/cv/face_detection/__init__.py
+++ b/modelscope/models/cv/face_detection/__init__.py
@@ -4,12 +4,15 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
+    from .mtcnn import MtcnnFaceDetector
     from .retinaface import RetinaFaceDetection
     from .ulfd_slim import UlfdFaceDetector
+
 else:
     _import_structure = {
         'ulfd_slim': ['UlfdFaceDetector'],
-        'retinaface': ['RetinaFaceDetection']
+        'retinaface': ['RetinaFaceDetection'],
+        'mtcnn': ['MtcnnFaceDetector']
     }
 
     import sys
diff --git a/modelscope/models/cv/face_detection/mtcnn/__init__.py b/modelscope/models/cv/face_detection/mtcnn/__init__.py
new file mode 100644
index 00000000..b11c4740
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/__init__.py
@@ -0,0 +1 @@
+from .models.detector import MtcnnFaceDetector
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/__init__.py b/modelscope/models/cv/face_detection/mtcnn/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/box_utils.py b/modelscope/models/cv/face_detection/mtcnn/models/box_utils.py
new file mode 100644
index 00000000..f6a27b05
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/models/box_utils.py
@@ -0,0 +1,240 @@
+# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch
+import numpy as np
+from PIL import Image
+
+
+def nms(boxes, overlap_threshold=0.5, mode='union'):
+    """Non-maximum suppression.
+
+    Arguments:
+        boxes: a float numpy array of shape [n, 5],
+            where each row is (xmin, ymin, xmax, ymax, score).
+        overlap_threshold: a float number.
+        mode: 'union' or 'min'.
+
+    Returns:
+        list with indices of the selected boxes
+    """
+
+    # if there are no boxes, return the empty list
+    if len(boxes) == 0:
+        return []
+
+    # list of picked indices
+    pick = []
+
+    # grab the coordinates of the bounding boxes
+    x1, y1, x2, y2, score = [boxes[:, i] for i in range(5)]
+
+    area = (x2 - x1 + 1.0) * (y2 - y1 + 1.0)
+    ids = np.argsort(score)  # in increasing order
+
+    while len(ids) > 0:
+
+        # grab index of the largest value
+        last = len(ids) - 1
+        i = ids[last]
+        pick.append(i)
+
+        # compute intersections
+        # of the box with the largest score
+        # with the rest of boxes
+
+        # left top corner of intersection boxes
+        ix1 = np.maximum(x1[i], x1[ids[:last]])
+        iy1 = np.maximum(y1[i], y1[ids[:last]])
+
+        # right bottom corner of intersection boxes
+        ix2 = np.minimum(x2[i], x2[ids[:last]])
+        iy2 = np.minimum(y2[i], y2[ids[:last]])
+
+        # width and height of intersection boxes
+        w = np.maximum(0.0, ix2 - ix1 + 1.0)
+        h = np.maximum(0.0, iy2 - iy1 + 1.0)
+
+        # intersections' areas
+        inter = w * h
+        if mode == 'min':
+            overlap = inter / np.minimum(area[i], area[ids[:last]])
+        elif mode == 'union':
+            # intersection over union (IoU)
+            overlap = inter / (area[i] + area[ids[:last]] - inter)
+
+        # delete all boxes where overlap is too big
+        ids = np.delete(
+            ids,
+            np.concatenate([[last],
+                            np.where(overlap > overlap_threshold)[0]]))
+
+    return pick
+
+
+def convert_to_square(bboxes):
+    """Convert bounding boxes to a square form.
+
+    Arguments:
+        bboxes: a float numpy array of shape [n, 5].
+
+    Returns:
+        a float numpy array of shape [n, 5],
+            squared bounding boxes.
+    """
+
+    square_bboxes = np.zeros_like(bboxes)
+    x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)]
+    h = y2 - y1 + 1.0
+    w = x2 - x1 + 1.0
+    max_side = np.maximum(h, w)
+    square_bboxes[:, 0] = x1 + w * 0.5 - max_side * 0.5
+    square_bboxes[:, 1] = y1 + h * 0.5 - max_side * 0.5
+    square_bboxes[:, 2] = square_bboxes[:, 0] + max_side - 1.0
+    square_bboxes[:, 3] = square_bboxes[:, 1] + max_side - 1.0
+    return square_bboxes
+
+
+def calibrate_box(bboxes, offsets):
+    """Transform bounding boxes to be more like true bounding boxes.
+    'offsets' is one of the outputs of the nets.
+
+    Arguments:
+        bboxes: a float numpy array of shape [n, 5].
+        offsets: a float numpy array of shape [n, 4].
+
+    Returns:
+        a float numpy array of shape [n, 5].
+    """
+    x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)]
+    w = x2 - x1 + 1.0
+    h = y2 - y1 + 1.0
+    w = np.expand_dims(w, 1)
+    h = np.expand_dims(h, 1)
+
+    # this is what is happening here:
+    # tx1, ty1, tx2, ty2 = [offsets[:, i] for i in range(4)]
+    # x1_true = x1 + tx1*w
+    # y1_true = y1 + ty1*h
+    # x2_true = x2 + tx2*w
+    # y2_true = y2 + ty2*h
+    # below is just more compact form of this
+
+    # are offsets always such that
+    # x1 < x2 and y1 < y2 ?
+
+    translation = np.hstack([w, h, w, h]) * offsets
+    bboxes[:, 0:4] = bboxes[:, 0:4] + translation
+    return bboxes
+
+
+def get_image_boxes(bounding_boxes, img, size=24):
+    """Cut out boxes from the image.
+
+    Arguments:
+        bounding_boxes: a float numpy array of shape [n, 5].
+        img: an instance of PIL.Image.
+        size: an integer, size of cutouts.
+
+    Returns:
+        a float numpy array of shape [n, 3, size, size].
+    """
+
+    num_boxes = len(bounding_boxes)
+    width, height = img.size
+
+    [dy, edy, dx, edx, y, ey, x, ex, w,
+     h] = correct_bboxes(bounding_boxes, width, height)
+    img_boxes = np.zeros((num_boxes, 3, size, size), 'float32')
+
+    for i in range(num_boxes):
+        img_box = np.zeros((h[i], w[i], 3), 'uint8')
+
+        img_array = np.asarray(img, 'uint8')
+        img_box[dy[i]:(edy[i] + 1), dx[i]:(edx[i] + 1), :] =\
+            img_array[y[i]:(ey[i] + 1), x[i]:(ex[i] + 1), :]
+
+        # resize
+        img_box = Image.fromarray(img_box)
+        img_box = img_box.resize((size, size), Image.BILINEAR)
+        img_box = np.asarray(img_box, 'float32')
+
+        img_boxes[i, :, :, :] = _preprocess(img_box)
+
+    return img_boxes
+
+
+def correct_bboxes(bboxes, width, height):
+    """Crop boxes that are too big and get coordinates
+    with respect to cutouts.
+
+    Arguments:
+        bboxes: a float numpy array of shape [n, 5],
+            where each row is (xmin, ymin, xmax, ymax, score).
+        width: a float number.
+        height: a float number.
+
+    Returns:
+        dy, dx, edy, edx: int numpy arrays of shape [n],
+            coordinates of the boxes with respect to the cutouts.
+        y, x, ey, ex: int numpy arrays of shape [n],
+            corrected ymin, xmin, ymax, xmax.
+        h, w: int numpy arrays of shape [n],
+            just heights and widths of boxes.
+
+        in the following order:
+            [dy, edy, dx, edx, y, ey, x, ex, w, h].
+    """
+
+    x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)]
+    w, h = x2 - x1 + 1.0, y2 - y1 + 1.0
+    num_boxes = bboxes.shape[0]
+
+    # 'e' stands for end
+    # (x, y) -> (ex, ey)
+    x, y, ex, ey = x1, y1, x2, y2
+
+    # we need to cut out a box from the image.
+    # (x, y, ex, ey) are corrected coordinates of the box
+    # in the image.
+    # (dx, dy, edx, edy) are coordinates of the box in the cutout
+    # from the image.
+    dx, dy = np.zeros((num_boxes, )), np.zeros((num_boxes, ))
+    edx, edy = w.copy() - 1.0, h.copy() - 1.0
+
+    # if box's bottom right corner is too far right
+    ind = np.where(ex > width - 1.0)[0]
+    edx[ind] = w[ind] + width - 2.0 - ex[ind]
+    ex[ind] = width - 1.0
+
+    # if box's bottom right corner is too low
+    ind = np.where(ey > height - 1.0)[0]
+    edy[ind] = h[ind] + height - 2.0 - ey[ind]
+    ey[ind] = height - 1.0
+
+    # if box's top left corner is too far left
+    ind = np.where(x < 0.0)[0]
+    dx[ind] = 0.0 - x[ind]
+    x[ind] = 0.0
+
+    # if box's top left corner is too high
+    ind = np.where(y < 0.0)[0]
+    dy[ind] = 0.0 - y[ind]
+    y[ind] = 0.0
+
+    return_list = [dy, edy, dx, edx, y, ey, x, ex, w, h]
+    return_list = [i.astype('int32') for i in return_list]
+
+    return return_list
+
+
+def _preprocess(img):
+    """Preprocessing step before feeding the network.
+
+    Arguments:
+        img: a float numpy array of shape [h, w, c].
+
+    Returns:
+        a float numpy array of shape [1, c, h, w].
+    """
+    img = img.transpose((2, 0, 1))
+    img = np.expand_dims(img, 0)
+    img = (img - 127.5) * 0.0078125
+    return img
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/detector.py b/modelscope/models/cv/face_detection/mtcnn/models/detector.py
new file mode 100644
index 00000000..9c3aca3a
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/models/detector.py
@@ -0,0 +1,149 @@
+# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch
+import os
+
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+from PIL import Image
+from torch.autograd import Variable
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import Tasks
+from .box_utils import calibrate_box, convert_to_square, get_image_boxes, nms
+from .first_stage import run_first_stage
+from .get_nets import ONet, PNet, RNet
+
+
+@MODELS.register_module(Tasks.face_detection, module_name=Models.mtcnn)
+class MtcnnFaceDetector(TorchModel):
+
+    def __init__(self, model_path, device='cuda'):
+        super().__init__(model_path)
+        torch.set_grad_enabled(False)
+        cudnn.benchmark = True
+        self.model_path = model_path
+        self.device = device
+
+        self.pnet = PNet(model_path=os.path.join(self.model_path, 'pnet.npy'))
+        self.rnet = RNet(model_path=os.path.join(self.model_path, 'rnet.npy'))
+        self.onet = ONet(model_path=os.path.join(self.model_path, 'onet.npy'))
+
+        self.pnet = self.pnet.to(device)
+        self.rnet = self.rnet.to(device)
+        self.onet = self.onet.to(device)
+
+    def forward(self, input):
+        image = Image.fromarray(np.uint8(input['img'].cpu().numpy()))
+        pnet = self.pnet
+        rnet = self.rnet
+        onet = self.onet
+        onet.eval()
+
+        min_face_size = 20.0
+        thresholds = [0.7, 0.8, 0.9]
+        nms_thresholds = [0.7, 0.7, 0.7]
+
+        # BUILD AN IMAGE PYRAMID
+        width, height = image.size
+        min_length = min(height, width)
+
+        min_detection_size = 12
+        factor = 0.707  # sqrt(0.5)
+
+        # scales for scaling the image
+        scales = []
+
+        m = min_detection_size / min_face_size
+        min_length *= m
+
+        factor_count = 0
+        while min_length > min_detection_size:
+            scales.append(m * factor**factor_count)
+            min_length *= factor
+            factor_count += 1
+
+        # STAGE 1
+
+        # it will be returned
+        bounding_boxes = []
+
+        # run P-Net on different scales
+        for s in scales:
+            boxes = run_first_stage(
+                image,
+                pnet,
+                scale=s,
+                threshold=thresholds[0],
+                device=self.device)
+            bounding_boxes.append(boxes)
+
+        # collect boxes (and offsets, and scores) from different scales
+        bounding_boxes = [i for i in bounding_boxes if i is not None]
+        bounding_boxes = np.vstack(bounding_boxes)
+
+        keep = nms(bounding_boxes[:, 0:5], nms_thresholds[0])
+        bounding_boxes = bounding_boxes[keep]
+
+        # use offsets predicted by pnet to transform bounding boxes
+        bounding_boxes = calibrate_box(bounding_boxes[:, 0:5],
+                                       bounding_boxes[:, 5:])
+        # shape [n_boxes, 5]
+
+        bounding_boxes = convert_to_square(bounding_boxes)
+        bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4])
+
+        # STAGE 2
+
+        img_boxes = get_image_boxes(bounding_boxes, image, size=24)
+        img_boxes = Variable(torch.FloatTensor(img_boxes), volatile=True)
+        output = rnet(img_boxes.to(self.device))
+        offsets = output[0].cpu().data.numpy()  # shape [n_boxes, 4]
+        probs = output[1].cpu().data.numpy()  # shape [n_boxes, 2]
+
+        keep = np.where(probs[:, 1] > thresholds[1])[0]
+        bounding_boxes = bounding_boxes[keep]
+        bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, ))
+        offsets = offsets[keep]
+
+        keep = nms(bounding_boxes, nms_thresholds[1])
+        bounding_boxes = bounding_boxes[keep]
+        bounding_boxes = calibrate_box(bounding_boxes, offsets[keep])
+        bounding_boxes = convert_to_square(bounding_boxes)
+        bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4])
+
+        # STAGE 3
+
+        img_boxes = get_image_boxes(bounding_boxes, image, size=48)
+        if len(img_boxes) == 0:
+            return [], []
+        img_boxes = Variable(torch.FloatTensor(img_boxes), volatile=True)
+        output = onet(img_boxes.to(self.device))
+        landmarks = output[0].cpu().data.numpy()  # shape [n_boxes, 10]
+        offsets = output[1].cpu().data.numpy()  # shape [n_boxes, 4]
+        probs = output[2].cpu().data.numpy()  # shape [n_boxes, 2]
+
+        keep = np.where(probs[:, 1] > thresholds[2])[0]
+        bounding_boxes = bounding_boxes[keep]
+        bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, ))
+        offsets = offsets[keep]
+        landmarks = landmarks[keep]
+
+        # compute landmark points
+        width = bounding_boxes[:, 2] - bounding_boxes[:, 0] + 1.0
+        height = bounding_boxes[:, 3] - bounding_boxes[:, 1] + 1.0
+        xmin, ymin = bounding_boxes[:, 0], bounding_boxes[:, 1]
+        landmarks[:, 0:5] = np.expand_dims(
+            xmin, 1) + np.expand_dims(width, 1) * landmarks[:, 0:5]
+        landmarks[:, 5:10] = np.expand_dims(
+            ymin, 1) + np.expand_dims(height, 1) * landmarks[:, 5:10]
+
+        bounding_boxes = calibrate_box(bounding_boxes, offsets)
+        keep = nms(bounding_boxes, nms_thresholds[2], mode='min')
+        bounding_boxes = bounding_boxes[keep]
+        landmarks = landmarks[keep]
+        landmarks = landmarks.reshape(-1, 2, 5).transpose(
+            (0, 2, 1)).reshape(-1, 10)
+
+        return bounding_boxes, landmarks
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/first_stage.py b/modelscope/models/cv/face_detection/mtcnn/models/first_stage.py
new file mode 100644
index 00000000..e2aba47e
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/models/first_stage.py
@@ -0,0 +1,100 @@
+# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch
+import math
+
+import numpy as np
+import torch
+from PIL import Image
+from torch.autograd import Variable
+
+from .box_utils import _preprocess, nms
+
+
+def run_first_stage(image, net, scale, threshold, device='cuda'):
+    """Run P-Net, generate bounding boxes, and do NMS.
+
+    Arguments:
+        image: an instance of PIL.Image.
+        net: an instance of pytorch's nn.Module, P-Net.
+        scale: a float number,
+            scale width and height of the image by this number.
+        threshold: a float number, threshold on the probability of a
+            face when generating bounding boxes from predictions of the net.
+        device: device the input tensor is moved to before the forward pass.
+
+    Returns:
+        a float numpy array of shape [n_boxes, 9],
+            bounding boxes with scores and offsets (4 + 1 + 4),
+            or None when no sliding window passes the threshold.
+    """
+    # scale the image and convert it to a float array
+    width, height = image.size
+    sw, sh = math.ceil(width * scale), math.ceil(height * scale)
+    img = image.resize((sw, sh), Image.BILINEAR)
+    img = np.asarray(img, 'float32')
+
+    # `volatile=True` has been a no-op since PyTorch 0.4; no_grad replaces it
+    with torch.no_grad():
+        output = net(torch.FloatTensor(_preprocess(img)).to(device))
+    # probs: probability of a face at each sliding window
+    probs = output[1].cpu().data.numpy()[0, 1, :, :]
+    # offsets: transformations to true bounding boxes
+    offsets = output[0].cpu().data.numpy()
+
+    boxes = _generate_bboxes(probs, offsets, scale, threshold)
+    if len(boxes) == 0:
+        return None
+
+    keep = nms(boxes[:, 0:5], overlap_threshold=0.5)
+    return boxes[keep]
+
+
+def _generate_bboxes(probs, offsets, scale, threshold):
+    """Turn a P-Net probability map into candidate face boxes.
+
+    Arguments:
+        probs: a float numpy array of shape [n, m].
+        offsets: a float numpy array of shape [1, 4, n, m].
+        scale: a float number,
+            width and height of the image were scaled by this number.
+        threshold: a float number.
+
+    Returns:
+        a float numpy array of shape [n_boxes, 9]
+        (x1, y1, x2, y2, score, tx1, ty1, tx2, ty2),
+        or an empty array when no cell exceeds the threshold.
+    """
+
+    # applying P-Net is equivalent, in some sense, to
+    # moving 12x12 window with stride 2
+    stride = 2
+    cell_size = 12
+
+    # grid cells where there is probably a face
+    rows, cols = np.where(probs > threshold)
+    if rows.size == 0:
+        return np.array([])
+
+    # transformations of bounding boxes, stacked as the rows
+    # tx1, ty1, tx2, ty2; they are defined as:
+    # w = x2 - x1 + 1
+    # h = y2 - y1 + 1
+    # x1_true = x1 + tx1*w
+    # x2_true = x2 + tx2*w
+    # y1_true = y1 + ty1*h
+    # y2_true = y2 + ty2*h
+    deltas = np.array([offsets[0, k, rows, cols] for k in range(4)])
+    score = probs[rows, cols]
+
+    # origin of every selected window on the scaled image
+    left = stride * cols + 1.0
+    top = stride * rows + 1.0
+
+    # P-Net is applied to scaled images
+    # so we need to rescale bounding boxes back
+    corners = [left, top, left + cell_size, top + cell_size]
+    bounding_boxes = np.vstack(
+        [np.round(c / scale) for c in corners] + [score, deltas])
+    # NOTE(review): the rationale for the +1.0 offset is not documented
+    # (the original author's comment here was "why one is added?")
+
+    return bounding_boxes.T
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/get_nets.py b/modelscope/models/cv/face_detection/mtcnn/models/get_nets.py
new file mode 100644
index 00000000..5fbbd33b
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/models/get_nets.py
@@ -0,0 +1,160 @@
+# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch
+from collections import OrderedDict
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Flatten(nn.Module):
+    """Flatten [batch, c, h, w] feature maps after swapping h and w."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        """
+        Arguments:
+            x: a float tensor with shape [batch_size, c, h, w].
+        Returns:
+            a float tensor with shape [batch_size, c*h*w].
+        """
+        # the two spatial axes are swapped before collapsing;
+        # without this pretrained model isn't working
+        swapped = x.transpose(3, 2).contiguous()
+        return swapped.view(swapped.size(0), -1)
+
+
+class PNet(nn.Module):
+
+    def __init__(self, model_path=None):
+        """Build the net; load every named parameter from `model_path`."""
+        super(PNet, self).__init__()
+
+        # suppose we have input with size HxW, then
+        # after first layer: H - 2,
+        # after pool: ceil((H - 2)/2),
+        # after second conv: ceil((H - 2)/2) - 2,
+        # after last conv: ceil((H - 2)/2) - 4,
+        # and the same for W
+
+        self.features = nn.Sequential(
+            OrderedDict([('conv1', nn.Conv2d(3, 10, 3, 1)),
+                         ('prelu1', nn.PReLU(10)),
+                         ('pool1', nn.MaxPool2d(2, 2, ceil_mode=True)),
+                         ('conv2', nn.Conv2d(10, 16, 3, 1)),
+                         ('prelu2', nn.PReLU(16)),
+                         ('conv3', nn.Conv2d(16, 32, 3, 1)),
+                         ('prelu3', nn.PReLU(32))]))
+
+        self.conv4_1 = nn.Conv2d(32, 2, 1, 1)
+        self.conv4_2 = nn.Conv2d(32, 4, 1, 1)
+        # NOTE: allow_pickle=True unpickles the file; load trusted weights only
+        weights = np.load(model_path, allow_pickle=True)[()]
+        for n, p in self.named_parameters():
+            p.data = torch.FloatTensor(weights[n])
+
+    def forward(self, x):
+        """
+        Arguments:
+            x: a float tensor with shape [batch_size, 3, h, w].
+        Returns:
+            b: a float tensor with shape [batch_size, 4, h', w'].
+            a: a float tensor with shape [batch_size, 2, h', w'].
+        """
+        x = self.features(x)
+        a = self.conv4_1(x)
+        b = self.conv4_2(x)
+        a = F.softmax(a, dim=1)  # explicit form of the deprecated implicit dim
+        return b, a
+
+
+class RNet(nn.Module):
+
+    def __init__(self, model_path=None):
+        """Build the net; load every named parameter from `model_path`."""
+        super(RNet, self).__init__()
+
+        self.features = nn.Sequential(
+            OrderedDict([('conv1', nn.Conv2d(3, 28, 3, 1)),
+                         ('prelu1', nn.PReLU(28)),
+                         ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)),
+                         ('conv2', nn.Conv2d(28, 48, 3, 1)),
+                         ('prelu2', nn.PReLU(48)),
+                         ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)),
+                         ('conv3', nn.Conv2d(48, 64, 2, 1)),
+                         ('prelu3', nn.PReLU(64)), ('flatten', Flatten()),
+                         ('conv4', nn.Linear(576, 128)),
+                         ('prelu4', nn.PReLU(128))]))
+
+        self.conv5_1 = nn.Linear(128, 2)
+        self.conv5_2 = nn.Linear(128, 4)
+        # NOTE: allow_pickle=True unpickles the file; load trusted weights only
+        weights = np.load(model_path, allow_pickle=True)[()]
+        for n, p in self.named_parameters():
+            p.data = torch.FloatTensor(weights[n])
+
+    def forward(self, x):
+        """
+        Arguments:
+            x: a float tensor with shape [batch_size, 3, h, w].
+        Returns:
+            b: a float tensor with shape [batch_size, 4].
+            a: a float tensor with shape [batch_size, 2].
+        """
+        x = self.features(x)
+        a = self.conv5_1(x)
+        b = self.conv5_2(x)
+        a = F.softmax(a, dim=1)  # explicit form of the deprecated implicit dim
+        return b, a
+
+
+class ONet(nn.Module):
+
+    def __init__(self, model_path=None):
+        """Build the net; load every named parameter from `model_path`."""
+        super(ONet, self).__init__()
+
+        self.features = nn.Sequential(
+            OrderedDict([
+                ('conv1', nn.Conv2d(3, 32, 3, 1)),
+                ('prelu1', nn.PReLU(32)),
+                ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)),
+                ('conv2', nn.Conv2d(32, 64, 3, 1)),
+                ('prelu2', nn.PReLU(64)),
+                ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)),
+                ('conv3', nn.Conv2d(64, 64, 3, 1)),
+                ('prelu3', nn.PReLU(64)),
+                ('pool3', nn.MaxPool2d(2, 2, ceil_mode=True)),
+                ('conv4', nn.Conv2d(64, 128, 2, 1)),
+                ('prelu4', nn.PReLU(128)),
+                ('flatten', Flatten()),
+                ('conv5', nn.Linear(1152, 256)),
+                ('drop5', nn.Dropout(0.25)),
+                ('prelu5', nn.PReLU(256)),
+            ]))
+
+        self.conv6_1 = nn.Linear(256, 2)
+        self.conv6_2 = nn.Linear(256, 4)
+        self.conv6_3 = nn.Linear(256, 10)
+        # NOTE: allow_pickle=True unpickles the file; load trusted weights only
+        weights = np.load(model_path, allow_pickle=True)[()]
+        for n, p in self.named_parameters():
+            p.data = torch.FloatTensor(weights[n])
+
+    def forward(self, x):
+        """
+        Arguments:
+            x: a float tensor with shape [batch_size, 3, h, w].
+        Returns:
+            c: a float tensor with shape [batch_size, 10].
+            b: a float tensor with shape [batch_size, 4].
+            a: a float tensor with shape [batch_size, 2].
+        """
+        x = self.features(x)
+        a = self.conv6_1(x)
+        b = self.conv6_2(x)
+        c = self.conv6_3(x)
+        a = F.softmax(a, dim=1)  # explicit form of the deprecated implicit dim
+        return c, b, a
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 02682fa0..3eb5cd82 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -51,6 +51,7 @@ if TYPE_CHECKING:
     from .ulfd_face_detection_pipeline import UlfdFaceDetectionPipeline
     from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline
     from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline
+    from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipeline
 
 else:
     _import_structure = {
@@ -114,7 +115,8 @@ else:
         'ulfd_face_detection_pipeline': ['UlfdFaceDetectionPipeline'],
         'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'],
         'facial_expression_recognition_pipelin':
-        ['FacialExpressionRecognitionPipeline']
+        ['FacialExpressionRecognitionPipeline'],
+        'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'],
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py
new file mode 100644
index 00000000..57bf9920
--- /dev/null
+++ b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py
@@ -0,0 +1,56 @@
+import os.path as osp
+from typing import Any, Dict
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_detection import MtcnnFaceDetector
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_detection, module_name=Pipelines.mtcnn_face_detection)
+class MtcnnFaceDetectionPipeline(Pipeline):
+    """Face detection pipeline backed by `MtcnnFaceDetector`."""
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a face detection pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, './weights')
+        logger.info(f'loading model from {ckpt_path}')
+        device = torch.device(
+            'cuda:0' if torch.cuda.is_available() else 'cpu')
+        self.detector = MtcnnFaceDetector(model_path=ckpt_path, device=device)
+        self.device = device
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return {'img': LoadImage.convert_to_ndarray(input)}
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Detect faces and return their scores, boxes and keypoints."""
+        result = self.detector(input)
+        assert result is not None
+        dets, lms = result[0], result[1]
+        # the detection code may return plain empty lists when no face is
+        # found, and those cannot be sliced like an ndarray
+        found = len(dets) > 0
+        return {
+            OutputKeys.SCORES: dets[:, 4].tolist() if found else [],
+            OutputKeys.BOXES: dets[:, :4].tolist() if found else [],
+            OutputKeys.KEYPOINTS: lms.tolist() if found else [],
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/tests/pipelines/test_mtcnn_face_detection.py b/tests/pipelines/test_mtcnn_face_detection.py
new file mode 100644
index 00000000..5afb5588
--- /dev/null
+++ b/tests/pipelines/test_mtcnn_face_detection.py
@@ -0,0 +1,38 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+from PIL import Image
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_result
+from modelscope.utils.test_utils import test_level
+
+
+class MtcnnFaceDetectionTest(unittest.TestCase):
+    """Smoke test that runs the MTCNN face detection pipeline end to end."""
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_manual_face-detection_mtcnn'
+
+    def show_result(self, img_path, detection_result):
+        """Draw the detections on the image and write it to result.png."""
+        rendered = draw_face_detection_result(img_path, detection_result)
+        cv2.imwrite('result.png', rendered)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        detector = pipeline(Tasks.face_detection, model=self.model_id)
+        img_path = 'data/test/images/mtcnn_face_detection.jpg'
+
+        # the pipeline is fed the file path first and then a PIL image
+        for source in (img_path, Image.open(img_path)):
+            detections = detector(source)
+            self.show_result(img_path, detections)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 8f05fa8cf18ee3e997f8be0c3dc34a31854fa3af Mon Sep 17 00:00:00 2001
From: ly261666 <ly261666@alibaba-inc.com>
Date: Wed, 7 Sep 2022 09:35:59 +0800
Subject: [PATCH 073/175] =?UTF-8?q?[to=20#42322933]=20=E6=96=B0=E5=A2=9EMo?=
 =?UTF-8?q?gFace=E4=BA=BA=E8=84=B8=E6=A3=80=E6=B5=8B=E5=99=A8=20=20=20=20?=
 =?UTF-8?q?=20=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/Ma?=
 =?UTF-8?q?aS-lib/codereview/9921926?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data/test/images/mog_face_detection.jpg       |   3 +
 modelscope/metainfo.py                        |   2 +
 .../models/cv/face_detection/__init__.py      |   5 +-
 .../cv/face_detection/mogface/__init__.py     |   1 +
 .../face_detection/mogface/models/__init__.py |   0
 .../mogface/models/detectors.py               |  96 ++++++++
 .../face_detection/mogface/models/mogface.py  | 135 +++++++++++
 .../mogface/models/mogprednet.py              | 164 ++++++++++++++
 .../face_detection/mogface/models/resnet.py   | 193 ++++++++++++++++
 .../cv/face_detection/mogface/models/utils.py | 212 ++++++++++++++++++
 modelscope/pipelines/cv/__init__.py           |   2 +
 .../cv/mog_face_detection_pipeline.py         |  54 +++++
 tests/pipelines/test_mog_face_detection.py    |  33 +++
 13 files changed, 898 insertions(+), 2 deletions(-)
 create mode 100644 data/test/images/mog_face_detection.jpg
 create mode 100644 modelscope/models/cv/face_detection/mogface/__init__.py
 create mode 100644 modelscope/models/cv/face_detection/mogface/models/__init__.py
 create mode 100644 modelscope/models/cv/face_detection/mogface/models/detectors.py
 create mode 100644 modelscope/models/cv/face_detection/mogface/models/mogface.py
 create mode 100644 modelscope/models/cv/face_detection/mogface/models/mogprednet.py
 create mode 100644 modelscope/models/cv/face_detection/mogface/models/resnet.py
 create mode 100755 modelscope/models/cv/face_detection/mogface/models/utils.py
 create mode 100644 modelscope/pipelines/cv/mog_face_detection_pipeline.py
 create mode 100644 tests/pipelines/test_mog_face_detection.py

diff --git a/data/test/images/mog_face_detection.jpg b/data/test/images/mog_face_detection.jpg
new file mode 100644
index 00000000..c95881fe
--- /dev/null
+++ b/data/test/images/mog_face_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
+size 87228
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index d7594794..270c5aaf 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -35,6 +35,7 @@ class Models(object):
     fer = 'fer'
     retinaface = 'retinaface'
     shop_segmentation = 'shop-segmentation'
+    mogface = 'mogface'
     mtcnn = 'mtcnn'
     ulfd = 'ulfd'
 
@@ -128,6 +129,7 @@ class Pipelines(object):
     ulfd_face_detection = 'manual-face-detection-ulfd'
     facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
     retina_face_detection = 'resnet50-face-detection-retinaface'
+    mog_face_detection = 'resnet101-face-detection-cvpr22papermogface'
     mtcnn_face_detection = 'manual-face-detection-mtcnn'
     live_category = 'live-category'
     general_image_classification = 'vit-base_image-classification_ImageNet-labels'
diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py
index ed8832c2..a2a845d2 100644
--- a/modelscope/models/cv/face_detection/__init__.py
+++ b/modelscope/models/cv/face_detection/__init__.py
@@ -4,15 +4,16 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
+    from .mogface import MogFaceDetector
     from .mtcnn import MtcnnFaceDetector
     from .retinaface import RetinaFaceDetection
     from .ulfd_slim import UlfdFaceDetector
-
 else:
     _import_structure = {
         'ulfd_slim': ['UlfdFaceDetector'],
         'retinaface': ['RetinaFaceDetection'],
-        'mtcnn': ['MtcnnFaceDetector']
+        'mtcnn': ['MtcnnFaceDetector'],
+        'mogface': ['MogFaceDetector']
     }
 
     import sys
diff --git a/modelscope/models/cv/face_detection/mogface/__init__.py b/modelscope/models/cv/face_detection/mogface/__init__.py
new file mode 100644
index 00000000..8190b649
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/__init__.py
@@ -0,0 +1 @@
+from .models.detectors import MogFaceDetector
diff --git a/modelscope/models/cv/face_detection/mogface/models/__init__.py b/modelscope/models/cv/face_detection/mogface/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/mogface/models/detectors.py b/modelscope/models/cv/face_detection/mogface/models/detectors.py
new file mode 100644
index 00000000..5ae67104
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/detectors.py
@@ -0,0 +1,96 @@
+import os
+
+import cv2
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import Tasks
+from .mogface import MogFace
+from .utils import MogPriorBox, mogdecode, py_cpu_nms
+
+
+@MODELS.register_module(Tasks.face_detection, module_name=Models.mogface)
+class MogFaceDetector(TorchModel):
+    """Single-image face detector built around the `MogFace` network."""
+
+    def __init__(self, model_path, device='cuda'):
+        super().__init__(model_path)
+        # NOTE: the next two settings are process-wide, not local to this model
+        torch.set_grad_enabled(False)
+        cudnn.benchmark = True
+        self.model_path = model_path
+        self.device = device
+        self.net = MogFace()
+        self.load_model()
+        self.net = self.net.to(device)
+        self.mean = np.array([[104, 117, 123]])
+
+    def load_model(self, load_to_cpu=False):
+        """Load the checkpoint at `self.model_path` on CPU; set eval mode."""
+        pretrained_dict = torch.load(
+            self.model_path, map_location=torch.device('cpu'))
+        self.net.load_state_dict(pretrained_dict, strict=False)
+        self.net.eval()
+
+    def forward(self, input):
+        """Detect faces in `input['img']`; rows are [4 box values, score]."""
+        img_raw = input['img']
+        # float32 copy so the in-place normalisation works for any input dtype
+        img = np.array(img_raw.cpu().detach(), dtype=np.float32)
+        img = img[:, :, ::-1]
+
+        im_height, im_width = img.shape[:2]
+        ss = 1.0
+        # shrink large images so that the longer side becomes 1000 px
+        if max(im_height, im_width) > 1500:
+            ss = 1000.0 / max(im_height, im_width)
+            img = cv2.resize(img, (0, 0), fx=ss, fy=ss)
+            im_height, im_width = img.shape[:2]
+
+        img -= np.array([[103.53, 116.28, 123.675]])
+        img /= np.array([[57.375, 57.120003, 58.395]])
+        img /= 255
+        img = img[:, :, ::-1].copy()
+        img = img.transpose(2, 0, 1)
+        img = torch.from_numpy(img).unsqueeze(0).to(self.device)
+
+        conf, loc = self.net(img)  # forward pass
+
+        confidence_threshold = 0.82
+        nms_threshold = 0.4
+        top_k = 5000
+        keep_top_k = 750
+
+        priorbox = MogPriorBox(scale_list=[0.68])
+        priors = priorbox(im_height, im_width)
+        priors = torch.tensor(priors).to(self.device)
+
+        boxes = mogdecode(loc.data.squeeze(0), priors.data).cpu().numpy()
+        scores = conf.squeeze(0).data.cpu().numpy()[:, 0]
+
+        # ignore low scores
+        inds = np.where(scores > confidence_threshold)[0]
+        boxes = boxes[inds]
+        scores = scores[inds]
+
+        # keep top-K before NMS
+        order = scores.argsort()[::-1][:top_k]
+        boxes = boxes[order]
+        scores = scores[order]
+
+        # do NMS
+        dets = np.hstack((boxes, scores[:, np.newaxis])).astype(
+            np.float32, copy=False)
+        keep = py_cpu_nms(dets, nms_threshold)
+        dets = dets[keep, :]
+
+        # keep top-K faster NMS
+        dets = dets[:keep_top_k, :]
+
+        # map boxes back to the input resolution; the score column is left alone
+        dets[:, :4] /= ss
+        return dets
diff --git a/modelscope/models/cv/face_detection/mogface/models/mogface.py b/modelscope/models/cv/face_detection/mogface/models/mogface.py
new file mode 100644
index 00000000..294c2c6b
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/mogface.py
@@ -0,0 +1,135 @@
+# --------------------------------------------------------
+# The implementation is also open-sourced by the authors as Yang Liu, and is available publicly on
+# https://github.com/damo-cv/MogFace
+# --------------------------------------------------------
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .mogprednet import MogPredNet
+from .resnet import ResNet
+
+
+class MogFace(nn.Module):
+    """ResNet-101 backbone, LFPN neck and MogPredNet head chained together."""
+
+    def __init__(self):
+        super().__init__()
+        self.backbone = ResNet(depth=101)
+        self.fpn = LFPN()
+        self.pred_net = MogPredNet()
+
+    def forward(self, x):
+        # only the first LFPN output (the pyramid features) feeds the head
+        pyramid_feature_list = self.fpn(self.backbone(x))[0]
+        conf, loc = self.pred_net(pyramid_feature_list)
+        return conf, loc
+
+
+class FeatureFusion(nn.Module):
+    """Add a 2x-upsampled coarser map to a 1x1-projected lateral map."""
+
+    def __init__(self, lat_ch=256, **channels):
+        super(FeatureFusion, self).__init__()
+        self.main_conv = nn.Conv2d(channels['main'], lat_ch, kernel_size=1)
+
+    def forward(self, up, main):
+        main = self.main_conv(main)
+        _, _, H, W = main.size()
+        # F.upsample is deprecated; align_corners=False was its implicit default
+        res = F.interpolate(
+            up, scale_factor=2, mode='bilinear', align_corners=False)
+        return res[:, :, 0:H, 0:W] + main  # crop any overshoot, then fuse
+
+
+class LFPN(nn.Module):
+    """Feature pyramid built from c2-c5 plus two extra levels, c6 and c7."""
+
+    def __init__(self,
+                 c2_out_ch=256,
+                 c3_out_ch=512,
+                 c4_out_ch=1024,
+                 c5_out_ch=2048,
+                 c6_mid_ch=512,
+                 c6_out_ch=512,
+                 c7_mid_ch=128,
+                 c7_out_ch=256,
+                 out_dsfd_ft=True):
+        super(LFPN, self).__init__()
+        self.out_dsfd_ft = out_dsfd_ft
+        if self.out_dsfd_ft:
+            dsfd_module = []
+            dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(512, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(1024, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(2048, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1))
+            self.dsfd_modules = nn.ModuleList(dsfd_module)
+
+        c6_input_ch = c5_out_ch
+        self.c6 = nn.Sequential(*[
+            nn.Conv2d(
+                c6_input_ch,
+                c6_mid_ch,
+                kernel_size=1,
+            ),
+            nn.BatchNorm2d(c6_mid_ch),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                c6_mid_ch, c6_out_ch, kernel_size=3, padding=1, stride=2),
+            nn.BatchNorm2d(c6_out_ch),
+            nn.ReLU(inplace=True)
+        ])
+        self.c7 = nn.Sequential(*[
+            nn.Conv2d(
+                c6_out_ch,
+                c7_mid_ch,
+                kernel_size=1,
+            ),
+            nn.BatchNorm2d(c7_mid_ch),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                c7_mid_ch, c7_out_ch, kernel_size=3, padding=1, stride=2),
+            nn.BatchNorm2d(c7_out_ch),
+            nn.ReLU(inplace=True)
+        ])
+
+        self.p2_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1)
+        self.p3_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1)
+        self.p4_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1)
+
+        self.c5_lat = nn.Conv2d(c6_input_ch, 256, kernel_size=3, padding=1)
+        self.c6_lat = nn.Conv2d(c6_out_ch, 256, kernel_size=3, padding=1)
+        self.c7_lat = nn.Conv2d(c7_out_ch, 256, kernel_size=3, padding=1)
+
+        self.ff_c5_c4 = FeatureFusion(main=c4_out_ch)
+        self.ff_c4_c3 = FeatureFusion(main=c3_out_ch)
+        self.ff_c3_c2 = FeatureFusion(main=c2_out_ch)
+
+    def forward(self, feature_list):
+        """Return ([p2, p3, p4, c5, c6, c7], dsfd_fts); dsfd_fts may be None."""
+        c2, c3, c4, c5 = feature_list
+        c6 = self.c6(c5)
+        c7 = self.c7(c6)
+
+        c5 = self.c5_lat(c5)
+        c6 = self.c6_lat(c6)
+        c7 = self.c7_lat(c7)
+
+        dsfd_fts = None
+        if self.out_dsfd_ft:
+            dsfd_inputs = (c2, c3, c4, feature_list[-1], c6, c7)
+            dsfd_fts = [
+                conv(ft) for conv, ft in zip(self.dsfd_modules, dsfd_inputs)
+            ]
+
+        p4 = self.ff_c5_c4(c5, c4)
+        p3 = self.ff_c4_c3(p4, c3)
+        p2 = self.ff_c3_c2(p3, c2)
+
+        p2 = self.p2_lat(p2)
+        p3 = self.p3_lat(p3)
+        p4 = self.p4_lat(p4)
+
+        # always return a pair: a bare `if` used to fall through to None
+        return ([p2, p3, p4, c5, c6, c7], dsfd_fts)
diff --git a/modelscope/models/cv/face_detection/mogface/models/mogprednet.py b/modelscope/models/cv/face_detection/mogface/models/mogprednet.py
new file mode 100644
index 00000000..31384976
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/mogprednet.py
@@ -0,0 +1,164 @@
+# --------------------------------------------------------
+# The implementation is also open-sourced by the authors as Yang Liu, and is available publicly on
+# https://github.com/damo-cv/MogFace
+# --------------------------------------------------------
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class conv_bn(nn.Module):
+    """A 2-D convolution followed by batch normalisation (no activation)."""
+
+    def __init__(self, in_plane, out_plane, kernel_size, stride, padding):
+        super().__init__()
+        self.conv1 = nn.Conv2d(
+            in_plane,
+            out_plane,
+            kernel_size,
+            stride,
+            padding)
+        self.bn1 = nn.BatchNorm2d(out_plane)
+
+    def forward(self, x):
+        """Apply the convolution, then normalise its output."""
+        return self.bn1(self.conv1(x))
+
+
+class SSHContext(nn.Module):
+    """Context module with one full-width and two half-width conv branches.
+
+    The three branch outputs are concatenated along the channel axis, so
+    the result has `Xchannels + 2 * (Xchannels // 2)` channels.
+    """
+
+    def __init__(self, channels, Xchannels=256):
+        super().__init__()
+        half = Xchannels // 2
+
+        # full-width 3x3 branch
+        self.conv1 = nn.Conv2d(channels, Xchannels, 3, stride=1, padding=1)
+        # dilated 3x3 stem shared by the two half-width branches
+        self.conv2 = nn.Conv2d(
+            channels, half, 3, stride=1, padding=2, dilation=2)
+        # first half-width branch: one plain 3x3 conv on the stem
+        self.conv2_1 = nn.Conv2d(half, half, 3, stride=1, padding=1)
+        # second half-width branch: dilated 3x3 conv, then a plain 3x3 conv
+        self.conv2_2 = nn.Conv2d(
+            half, half, 3, stride=1, padding=2, dilation=2)
+        self.conv2_2_1 = nn.Conv2d(half, half, 3, stride=1, padding=1)
+
+    def forward(self, x):
+        """Run the three branches on `x` and concatenate them on dim 1."""
+        wide = F.relu(self.conv1(x), inplace=True)
+
+        stem = F.relu(self.conv2(x), inplace=True)
+        branch_a = F.relu(self.conv2_1(stem), inplace=True)
+
+        branch_b = F.relu(self.conv2_2(stem), inplace=True)
+        branch_b = F.relu(self.conv2_2_1(branch_b), inplace=True)
+
+        return torch.cat([wide, branch_a, branch_b], 1)
+
+
+class DeepHead(nn.Module):
+    """Head of stacked 3x3 conv + ReLU stages with optional GroupNorm."""
+    def __init__(self,
+                 in_channel=256,
+                 out_channel=256,
+                 use_gn=False,
+                 num_conv=4):
+        super(DeepHead, self).__init__()
+        self.use_gn = use_gn
+        self.num_conv = num_conv
+        self.conv1 = nn.Conv2d(in_channel, out_channel, 3, 1, 1)
+        self.conv2 = nn.Conv2d(out_channel, out_channel, 3, 1, 1)
+        self.conv3 = nn.Conv2d(out_channel, out_channel, 3, 1, 1)
+        self.conv4 = nn.Conv2d(out_channel, out_channel, 3, 1, 1)
+        if self.use_gn:
+            self.gn1 = nn.GroupNorm(16, out_channel)
+            self.gn2 = nn.GroupNorm(16, out_channel)
+            self.gn3 = nn.GroupNorm(16, out_channel)
+            self.gn4 = nn.GroupNorm(16, out_channel)
+
+    def forward(self, x):
+        if self.use_gn:
+            x1 = F.relu(self.gn1(self.conv1(x)), inplace=True)
+            x2 = F.relu(self.gn2(self.conv1(x1)), inplace=True)
+            x3 = F.relu(self.gn3(self.conv1(x2)), inplace=True)
+            x4 = F.relu(self.gn4(self.conv1(x3)), inplace=True)
+        else:
+            x1 = F.relu(self.conv1(x), inplace=True)
+            x2 = F.relu(self.conv1(x1), inplace=True)
+            if self.num_conv == 2:
+                return x2
+            x3 = F.relu(self.conv1(x2), inplace=True)
+            x4 = F.relu(self.conv1(x3), inplace=True)
+        # NOTE(review): all stages reuse self.conv1 (conv2-4 unused) - confirm
+        return x4
+
+
+class MogPredNet(nn.Module):
+    """Prediction head applied to every pyramid level in turn."""
+
+    def __init__(self,
+                 num_anchor_per_pixel=1,
+                 num_classes=1,
+                 input_ch_list=(256, 256, 256, 256, 256, 256),
+                 use_deep_head=True,
+                 deep_head_with_gn=True,
+                 use_ssh=True,
+                 deep_head_ch=512):
+        super(MogPredNet, self).__init__()
+        self.num_classes = num_classes
+        self.use_deep_head = use_deep_head
+        self.deep_head_with_gn = deep_head_with_gn
+        self.use_ssh = use_ssh
+        self.deep_head_ch = deep_head_ch
+
+        if self.use_ssh:
+            self.conv_SSH = SSHContext(input_ch_list[0],
+                                       self.deep_head_ch // 2)
+
+        if self.use_deep_head:
+            # build the heads for either GroupNorm setting; they used to be
+            # skipped without GroupNorm, which made forward() fail
+            self.deep_loc_head = DeepHead(
+                self.deep_head_ch,
+                self.deep_head_ch,
+                use_gn=self.deep_head_with_gn)
+            self.deep_cls_head = DeepHead(
+                self.deep_head_ch,
+                self.deep_head_ch,
+                use_gn=self.deep_head_with_gn)
+
+            self.pred_cls = nn.Conv2d(self.deep_head_ch,
+                                      1 * num_anchor_per_pixel, 3, 1, 1)
+            self.pred_loc = nn.Conv2d(self.deep_head_ch,
+                                      4 * num_anchor_per_pixel, 3, 1, 1)
+
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, pyramid_feature_list, dsfd_ft_list=None):
+        """Return (sigmoid scores [B, N, num_classes], offsets [B, N, 4])."""
+        loc = []
+        conf = []
+
+        if self.use_deep_head:
+            for x in pyramid_feature_list:
+                if self.use_ssh:
+                    x = self.conv_SSH(x)
+                x_cls = self.deep_cls_head(x)
+                x_loc = self.deep_loc_head(x)
+
+                conf.append(
+                    self.pred_cls(x_cls).permute(0, 2, 3, 1).contiguous())
+                loc.append(
+                    self.pred_loc(x_loc).permute(0, 2, 3, 1).contiguous())
+
+        loc = torch.cat([o.view(o.size(0), -1, 4) for o in loc], 1)
+        conf = torch.cat(
+            [o.view(o.size(0), -1, self.num_classes) for o in conf], 1)
+        return self.sigmoid(conf), loc
diff --git a/modelscope/models/cv/face_detection/mogface/models/resnet.py b/modelscope/models/cv/face_detection/mogface/models/resnet.py
new file mode 100644
index 00000000..045f6fa3
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/resnet.py
@@ -0,0 +1,243 @@
+# This implementation is modified from the original ResNet implementation,
+#  which is also open-sourced by its authors (listed as Yang Liu),
+#  and is available publicly on  https://github.com/damo-cv/MogFace
+
+import torch.nn as nn
+
+
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+    """3x3 convolution with padding"""
+    return nn.Conv2d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=dilation,
+        groups=groups,
+        bias=False,
+        dilation=dilation)
+
+
+def conv1x1(in_planes, out_planes, stride=1):
+    """1x1 convolution"""
+    return nn.Conv2d(
+        in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+class BasicBlock(nn.Module):
+    """Two-layer 3x3 residual block (expansion 1) used when depth == 18."""
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
+        super(BasicBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        if groups != 1 or base_width != 64:
+            raise ValueError(
+                'BasicBlock only supports groups=1 and base_width=64')
+        if dilation > 1:
+            raise NotImplementedError(
+                'Dilation > 1 not supported in BasicBlock')
+        # conv1 and downsample both reduce resolution when stride != 1
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = norm_layer(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = norm_layer(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
+        super(Bottleneck, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.)) * groups
+        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv1x1(inplanes, width)
+        self.bn1 = norm_layer(width)
+        self.conv2 = conv3x3(width, width, stride, groups, dilation)
+        self.bn2 = norm_layer(width)
+        self.conv3 = conv1x1(width, planes * self.expansion)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+
+    def __init__(self,
+                 depth=50,
+                 groups=1,
+                 width_per_group=64,
+                 replace_stride_with_dilation=None,
+                 norm_layer=None,
+                 inplanes=64,
+                 shrink_ch_ratio=1):
+        super(ResNet, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        self._norm_layer = norm_layer
+
+        if depth == 50:
+            block = Bottleneck
+            layers = [3, 4, 6, 3]
+        elif depth == 101:
+            block = Bottleneck
+            layers = [3, 4, 23, 3]
+        elif depth == 152:
+            block = Bottleneck
+            layers = [3, 4, 36, 3]
+        elif depth == 18:
+            block = BasicBlock
+            layers = [2, 2, 2, 2]
+        else:
+            raise ValueError('only support depth in [18, 50, 101, 152]')
+
+        shrink_input_ch = int(inplanes * shrink_ch_ratio)
+        self.inplanes = int(inplanes * shrink_ch_ratio)
+        if shrink_ch_ratio == 0.125:
+            layers = [2, 3, 3, 3]
+
+        self.dilation = 1
+        if replace_stride_with_dilation is None:
+            # each element in the tuple indicates if we should replace
+            # the 2x2 stride with a dilated convolution instead
+            replace_stride_with_dilation = [False, False, False]
+        if len(replace_stride_with_dilation) != 3:
+            raise ValueError('replace_stride_with_dilation should be None '
+                             'or a 3-element tuple, got {}'.format(
+                                 replace_stride_with_dilation))
+        self.groups = groups
+        self.base_width = width_per_group
+        self.conv1 = nn.Conv2d(
+            3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = norm_layer(self.inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, shrink_input_ch, layers[0])
+        self.layer2 = self._make_layer(
+            block,
+            shrink_input_ch * 2,
+            layers[1],
+            stride=2,
+            dilate=replace_stride_with_dilation[0])
+        self.layer3 = self._make_layer(
+            block,
+            shrink_input_ch * 4,
+            layers[2],
+            stride=2,
+            dilate=replace_stride_with_dilation[1])
+        self.layer4 = self._make_layer(
+            block,
+            shrink_input_ch * 8,
+            layers[3],
+            stride=2,
+            dilate=replace_stride_with_dilation[2])
+
+    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                conv1x1(self.inplanes, planes * block.expansion, stride),
+                norm_layer(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(
+            block(self.inplanes, planes, stride, downsample, self.groups,
+                  self.base_width, previous_dilation, norm_layer))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(
+                block(
+                    self.inplanes,
+                    planes,
+                    groups=self.groups,
+                    base_width=self.base_width,
+                    dilation=self.dilation,
+                    norm_layer=norm_layer))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+        four_conv_layer = []
+        x = self.layer1(x)
+        four_conv_layer.append(x)
+        x = self.layer2(x)
+        four_conv_layer.append(x)
+        x = self.layer3(x)
+        four_conv_layer.append(x)
+        x = self.layer4(x)
+        four_conv_layer.append(x)
+
+        return four_conv_layer
diff --git a/modelscope/models/cv/face_detection/mogface/models/utils.py b/modelscope/models/cv/face_detection/mogface/models/utils.py
new file mode 100755
index 00000000..377ceb3d
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/utils.py
@@ -0,0 +1,212 @@
+# Modified from https://github.com/biubug6/Pytorch_Retinaface
+
+import math
+from itertools import product as product
+from math import ceil
+
+import numpy as np
+import torch
+
+
+def transform_anchor(anchors):
+    """
+    Convert corner boxes [x0, y0, x1, y1] to center form [c_x, c_y, w, h].
+    Widths are inclusive: x1 = x0 + w - 1, so w = x1 - x0 + 1 and
+    c_x = (x0 + x1) / 2 = (2x0 + w - 1) / 2 = x0 + (w - 1) / 2
+    """
+    return np.concatenate(((anchors[:, :2] + anchors[:, 2:]) / 2,
+                           anchors[:, 2:] - anchors[:, :2] + 1),
+                          axis=1)
+
+
+def normalize_anchor(anchors):
+    """
+    Convert center boxes [c_x, c_y, w, h] to corners [x0, y0, x1, y1].
+    """
+    item_1 = anchors[:, :2] - (anchors[:, 2:] - 1) / 2
+    item_2 = anchors[:, :2] + (anchors[:, 2:] - 1) / 2
+    return np.concatenate((item_1, item_2), axis=1)
+
+
+class MogPriorBox(object):
+    """
+    Anchor generator for FPN and single-layer heads (single layer untested).
+    __call__ returns np.array [num_anchors, 4] in [c_x, c_y, w, h] form.
+    """
+
+    def __init__(self,
+                 scale_list=[1.],
+                 aspect_ratio_list=[1.0],
+                 stride_list=[4, 8, 16, 32, 64, 128],
+                 anchor_size_list=[16, 32, 64, 128, 256, 512]):
+        self.scale_list = scale_list
+        self.aspect_ratio_list = aspect_ratio_list
+        self.stride_list = stride_list
+        self.anchor_size_list = anchor_size_list
+
+    def __call__(self, img_height, img_width):
+        final_anchor_list = []
+
+        for idx, stride in enumerate(self.stride_list):
+            anchor_list = []
+            cur_img_height = img_height
+            cur_img_width = img_width
+            tmp_stride = stride
+
+            while tmp_stride != 1:
+                tmp_stride = tmp_stride // 2
+                cur_img_height = (cur_img_height + 1) // 2
+                cur_img_width = (cur_img_width + 1) // 2
+
+            for i in range(cur_img_height):
+                for j in range(cur_img_width):
+                    for scale in self.scale_list:
+                        cx = (j + 0.5) * stride
+                        cy = (i + 0.5) * stride
+                        side_x = self.anchor_size_list[idx] * scale
+                        side_y = self.anchor_size_list[idx] * scale
+                        for ratio in self.aspect_ratio_list:
+                            anchor_list.append([
+                                cx, cy, side_x / math.sqrt(ratio),
+                                side_y * math.sqrt(ratio)
+                            ])
+
+            final_anchor_list.append(anchor_list)
+        final_anchor_arr = np.concatenate(final_anchor_list, axis=0)
+        normalized_anchor_arr = normalize_anchor(final_anchor_arr).astype(
+            'float32')
+        transformed_anchor = transform_anchor(normalized_anchor_arr)
+
+        return transformed_anchor
+
+
+class PriorBox(object):
+
+    def __init__(self, cfg, image_size=None, phase='train'):
+        super(PriorBox, self).__init__()
+        self.min_sizes = cfg['min_sizes']
+        self.steps = cfg['steps']
+        self.clip = cfg['clip']
+        self.image_size = image_size
+        self.feature_maps = [[
+            ceil(self.image_size[0] / step),
+            ceil(self.image_size[1] / step)
+        ] for step in self.steps]
+        self.name = 's'
+
+    def forward(self):
+        anchors = []
+        for k, f in enumerate(self.feature_maps):
+            min_sizes = self.min_sizes[k]
+            for i, j in product(range(f[0]), range(f[1])):
+                for min_size in min_sizes:
+                    s_kx = min_size / self.image_size[1]
+                    s_ky = min_size / self.image_size[0]
+                    dense_cx = [
+                        x * self.steps[k] / self.image_size[1]
+                        for x in [j + 0.5]
+                    ]
+                    dense_cy = [
+                        y * self.steps[k] / self.image_size[0]
+                        for y in [i + 0.5]
+                    ]
+                    for cy, cx in product(dense_cy, dense_cx):
+                        anchors += [cx, cy, s_kx, s_ky]
+
+        # back to torch land
+        output = torch.Tensor(anchors).view(-1, 4)
+        if self.clip:
+            output.clamp_(max=1, min=0)
+        return output
+
+
+def py_cpu_nms(dets, thresh):
+    """Pure Python NMS baseline."""
+    x1 = dets[:, 0]
+    y1 = dets[:, 1]
+    x2 = dets[:, 2]
+    y2 = dets[:, 3]
+    scores = dets[:, 4]
+
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1 + 1)
+        h = np.maximum(0.0, yy2 - yy1 + 1)
+        inter = w * h
+        ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+        inds = np.where(ovr <= thresh)[0]
+        order = order[inds + 1]
+
+    return keep
+
+
+def mogdecode(loc, anchors):
+    """
+    loc: torch.Tensor
+    anchors: 2-d, torch.Tensor (cx, cy, w, h)
+    boxes: 2-d, torch.Tensor (x0, y0, x1, y1)
+    """
+
+    boxes = torch.cat((anchors[:, :2] + loc[:, :2] * anchors[:, 2:],
+                       anchors[:, 2:] * torch.exp(loc[:, 2:])), 1)
+
+    boxes[:, 0] -= (boxes[:, 2] - 1) / 2
+    boxes[:, 1] -= (boxes[:, 3] - 1) / 2
+    boxes[:, 2] += boxes[:, 0] - 1
+    boxes[:, 3] += boxes[:, 1] - 1
+
+    return boxes
+
+
+# Adapted from https://github.com/Hakuyume/chainer-ssd
+def decode(loc, priors, variances):
+    """Decode locations from predictions using priors to undo
+    the encoding we did for offset regression at train time.
+    Args:
+        loc (tensor): location predictions for loc layers,
+            Shape: [num_priors,4]
+        priors (tensor): Prior boxes in center-offset form.
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        decoded bounding box predictions
+    """
+
+    boxes = torch.cat(
+        (priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
+         priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
+    boxes[:, :2] -= boxes[:, 2:] / 2
+    boxes[:, 2:] += boxes[:, :2]
+    return boxes
+
+
+def decode_landm(pre, priors, variances):
+    """Decode landm from predictions using priors to undo
+    the encoding we did for offset regression at train time.
+    Args:
+        pre (tensor): landm predictions for loc layers,
+            Shape: [num_priors,10]
+        priors (tensor): Prior boxes in center-offset form.
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        decoded landm predictions
+    """
+    a = priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:]
+    b = priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:]
+    c = priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:]
+    d = priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:]
+    e = priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:]
+    landms = torch.cat((a, b, c, d, e), dim=1)
+    return landms
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 3eb5cd82..a9dc05f2 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -48,6 +48,7 @@ if TYPE_CHECKING:
     from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline
     from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline
     from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
+    from .mog_face_detection_pipeline import MogFaceDetectionPipeline
     from .ulfd_face_detection_pipeline import UlfdFaceDetectionPipeline
     from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline
     from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline
@@ -112,6 +113,7 @@ else:
         ['TextDrivenSegmentationPipeline'],
         'movie_scene_segmentation_pipeline':
         ['MovieSceneSegmentationPipeline'],
+        'mog_face_detection_pipeline': ['MogFaceDetectionPipeline'],
         'ulfd_face_detection_pipeline': ['UlfdFaceDetectionPipeline'],
         'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'],
         'facial_expression_recognition_pipelin':
diff --git a/modelscope/pipelines/cv/mog_face_detection_pipeline.py b/modelscope/pipelines/cv/mog_face_detection_pipeline.py
new file mode 100644
index 00000000..8797ad12
--- /dev/null
+++ b/modelscope/pipelines/cv/mog_face_detection_pipeline.py
@@ -0,0 +1,54 @@
+import os.path as osp
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_detection import MogFaceDetector
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_detection, module_name=Pipelines.mog_face_detection)
+class MogFaceDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a face detection pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {ckpt_path}')
+        detector = MogFaceDetector(model_path=ckpt_path, device=self.device)
+        self.detector = detector
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        img = img.astype(np.float32)
+        result = {'img': img}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+
+        result = self.detector(input)
+        assert result is not None
+        bboxes = result[:, :4].tolist()
+        scores = result[:, 4].tolist()
+        return {
+            OutputKeys.SCORES: scores,
+            OutputKeys.BOXES: bboxes,
+            OutputKeys.KEYPOINTS: None,
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/tests/pipelines/test_mog_face_detection.py b/tests/pipelines/test_mog_face_detection.py
new file mode 100644
index 00000000..5c6d97c2
--- /dev/null
+++ b/tests/pipelines/test_mog_face_detection.py
@@ -0,0 +1,33 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result
+from modelscope.utils.test_utils import test_level
+
+
+class MogFaceDetectionTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_resnet101_face-detection_cvpr22papermogface'
+
+    def show_result(self, img_path, detection_result):
+        img = draw_face_detection_no_lm_result(img_path, detection_result)
+        cv2.imwrite('result.png', img)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        face_detection = pipeline(Tasks.face_detection, model=self.model_id)
+        img_path = 'data/test/images/mog_face_detection.jpg'
+
+        result = face_detection(img_path)
+        self.show_result(img_path, result)
+
+
+if __name__ == '__main__':
+    unittest.main()

From f96c4a5ea2f696c812f05868d4103c2d5b5ffcea Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Wed, 7 Sep 2022 11:12:18 +0800
Subject: [PATCH 074/175] [to #44742129] skip commit result when MODEL_TAG_URL
 is empty str

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10045468
---
 modelscope/utils/model_tag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/utils/model_tag.py b/modelscope/utils/model_tag.py
index 380ddccb..9c494eac 100644
--- a/modelscope/utils/model_tag.py
+++ b/modelscope/utils/model_tag.py
@@ -159,7 +159,7 @@ class ModelTag(object):
     """
 
     def commit_ut_result(self):
-        if self._URL is not None:
+        if self._URL is not None and self._URL != '':
             self.job_name = 'UT'
             self.source = 'dev'
             self.stage = 'integration'

From 7ed4015bdcb60a4580fa8605cc9bc49a60b28e7f Mon Sep 17 00:00:00 2001
From: "dingkun.ldk" <dingkun.ldk@alibaba-inc.com>
Date: Wed, 7 Sep 2022 11:57:30 +0800
Subject: [PATCH 075/175] =?UTF-8?q?[to=20#42322933]=E6=94=AF=E6=8C=81?=
 =?UTF-8?q?=E8=AF=8D=E6=80=A7=E6=A0=87=E6=B3=A8=20=20=20=20=20=20=20=20=20?=
 =?UTF-8?q?Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/coderevi?=
 =?UTF-8?q?ew/9980774?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 modelscope/metainfo.py                        |   7 ++
 modelscope/models/nlp/__init__.py             |  43 ++++---
 .../nlp/heads/sequence_classification_head.py |   1 -
 .../nlp/heads/token_classification_head.py    |  42 +++++++
 .../nlp/structbert/configuration_sbert.py     |   2 +-
 modelscope/models/nlp/task_models/__init__.py |   2 +
 .../nlp/task_models/token_classification.py   |  83 +++++++++++++
 modelscope/models/nlp/token_classification.py |   1 +
 modelscope/outputs.py                         |  30 ++---
 modelscope/pipelines/builder.py               |   3 +
 modelscope/pipelines/nlp/__init__.py          |  34 +++---
 .../nlp/token_classification_pipeline.py      |  92 +++++++++++++++
 modelscope/preprocessors/__init__.py          |  19 ++-
 modelscope/preprocessors/nlp.py               | 110 +++++++++++++++++-
 modelscope/utils/hub.py                       |  30 ++++-
 tests/pipelines/test_part_of_speech.py        |  55 +++++++++
 16 files changed, 477 insertions(+), 77 deletions(-)
 create mode 100644 modelscope/models/nlp/heads/token_classification_head.py
 create mode 100644 modelscope/models/nlp/task_models/token_classification.py
 create mode 100644 modelscope/pipelines/nlp/token_classification_pipeline.py
 create mode 100644 tests/pipelines/test_part_of_speech.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 270c5aaf..994095c3 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -55,7 +55,9 @@ class Models(object):
     space_modeling = 'space-modeling'
     star = 'star'
     tcrf = 'transformer-crf'
+    transformer_softmax = 'transformer-softmax'
     lcrf = 'lstm-crf'
+    gcnncrf = 'gcnn-crf'
     bart = 'bart'
     gpt3 = 'gpt3'
     plug = 'plug'
@@ -82,6 +84,7 @@ class Models(object):
 class TaskModels(object):
     # nlp task
     text_classification = 'text-classification'
+    token_classification = 'token-classification'
     information_extraction = 'information-extraction'
 
 
@@ -92,6 +95,8 @@ class Heads(object):
     bert_mlm = 'bert-mlm'
     # roberta mlm
     roberta_mlm = 'roberta-mlm'
+    # token cls
+    token_classification = 'token-classification'
     information_extraction = 'information-extraction'
 
 
@@ -167,6 +172,7 @@ class Pipelines(object):
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
     word_segmentation = 'word-segmentation'
+    part_of_speech = 'part-of-speech'
     named_entity_recognition = 'named-entity-recognition'
     text_generation = 'text-generation'
     sentiment_analysis = 'sentiment-analysis'
@@ -272,6 +278,7 @@ class Preprocessors(object):
     sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer'
     zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'
     text_error_correction = 'text-error-correction'
+    sequence_labeling_tokenizer = 'sequence-labeling-tokenizer'
     word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor'
     fill_mask = 'fill-mask'
     faq_question_answering_preprocessor = 'faq-question-answering-preprocessor'
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 9d54834c..40be8665 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -5,40 +5,39 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .backbones import SbertModel
-    from .heads import SequenceClassificationHead
+    from .bart_for_text_error_correction import BartForTextErrorCorrection
     from .bert_for_sequence_classification import BertForSequenceClassification
     from .bert_for_document_segmentation import BertForDocumentSegmentation
     from .csanmt_for_translation import CsanmtForTranslation
-    from .masked_language import (
-        StructBertForMaskedLM,
-        VecoForMaskedLM,
-        BertForMaskedLM,
-        DebertaV2ForMaskedLM,
-    )
+    from .heads import SequenceClassificationHead
+    from .gpt3 import GPT3ForTextGeneration
+    from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM,
+                                  BertForMaskedLM, DebertaV2ForMaskedLM)
     from .nncrf_for_named_entity_recognition import (
         TransformerCRFForNamedEntityRecognition,
         LSTMCRFForNamedEntityRecognition)
-    from .token_classification import SbertForTokenClassification
+    from .palm_v2 import PalmForTextGeneration
+    from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering
+    from .star_text_to_sql import StarForTextToSql
     from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification
     from .space import SpaceForDialogIntent
     from .space import SpaceForDialogModeling
     from .space import SpaceForDialogStateTracking
-    from .star_text_to_sql import StarForTextToSql
     from .task_models import (InformationExtractionModel,
-                              SingleBackboneTaskModelBase)
-    from .bart_for_text_error_correction import BartForTextErrorCorrection
-    from .gpt3 import GPT3ForTextGeneration
-    from .plug import PlugForTextGeneration
-    from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering
+                              SequenceClassificationModel,
+                              SingleBackboneTaskModelBase,
+                              TokenClassificationModel)
+    from .token_classification import SbertForTokenClassification
 
 else:
     _import_structure = {
-        'star_text_to_sql': ['StarForTextToSql'],
         'backbones': ['SbertModel'],
-        'heads': ['SequenceClassificationHead'],
-        'csanmt_for_translation': ['CsanmtForTranslation'],
+        'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
         'bert_for_sequence_classification': ['BertForSequenceClassification'],
         'bert_for_document_segmentation': ['BertForDocumentSegmentation'],
+        'csanmt_for_translation': ['CsanmtForTranslation'],
+        'heads': ['SequenceClassificationHead'],
+        'gpt3': ['GPT3ForTextGeneration'],
         'masked_language': [
             'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM',
             'DebertaV2ForMaskedLM'
@@ -48,7 +47,8 @@ else:
             'LSTMCRFForNamedEntityRecognition'
         ],
         'palm_v2': ['PalmForTextGeneration'],
-        'token_classification': ['SbertForTokenClassification'],
+        'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'],
+        'star_text_to_sql': ['StarForTextToSql'],
         'sequence_classification':
         ['VecoForSequenceClassification', 'SbertForSequenceClassification'],
         'space': [
@@ -57,12 +57,9 @@ else:
         ],
         'task_models': [
             'InformationExtractionModel', 'SequenceClassificationModel',
-            'SingleBackboneTaskModelBase'
+            'SingleBackboneTaskModelBase', 'TokenClassificationModel'
         ],
-        'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
-        'gpt3': ['GPT3ForTextGeneration'],
-        'plug': ['PlugForTextGeneration'],
-        'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'],
+        'token_classification': ['SbertForTokenClassification'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py
index 92f3a4ec..e608f035 100644
--- a/modelscope/models/nlp/heads/sequence_classification_head.py
+++ b/modelscope/models/nlp/heads/sequence_classification_head.py
@@ -19,7 +19,6 @@ class SequenceClassificationHead(TorchHead):
         super().__init__(**kwargs)
         config = self.config
         self.num_labels = config.num_labels
-        self.config = config
         classifier_dropout = (
             config['classifier_dropout'] if config.get('classifier_dropout')
             is not None else config['hidden_dropout_prob'])
diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py
new file mode 100644
index 00000000..481524ae
--- /dev/null
+++ b/modelscope/models/nlp/heads/token_classification_head.py
@@ -0,0 +1,42 @@
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+
+@HEADS.register_module(
+    Tasks.token_classification, module_name=Heads.token_classification)
+class TokenClassificationHead(TorchHead):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        config = self.config
+        self.num_labels = config.num_labels
+        classifier_dropout = (
+            config['classifier_dropout'] if config.get('classifier_dropout')
+            is not None else config['hidden_dropout_prob'])
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config['hidden_size'],
+                                    config['num_labels'])
+
+    def forward(self, inputs=None):
+        if isinstance(inputs, dict):
+            assert inputs.get('sequence_output') is not None
+            sequence_output = inputs.get('sequence_output')
+        else:
+            sequence_output = inputs
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+        return {OutputKeys.LOGITS: logits}
+
+    def compute_loss(self, outputs: Dict[str, torch.Tensor],
+                     labels) -> Dict[str, torch.Tensor]:
+        logits = outputs[OutputKeys.LOGITS]
+        return {OutputKeys.LOSS: F.cross_entropy(logits, labels)}
diff --git a/modelscope/models/nlp/structbert/configuration_sbert.py b/modelscope/models/nlp/structbert/configuration_sbert.py
index 374d4b62..a727a978 100644
--- a/modelscope/models/nlp/structbert/configuration_sbert.py
+++ b/modelscope/models/nlp/structbert/configuration_sbert.py
@@ -85,7 +85,7 @@ class SbertConfig(PretrainedConfig):
             If adv_bound not proveded, 2 * sigma will be used as the adv_bound factor
     """
 
-    model_type = 'sbert'
+    model_type = 'structbert'
 
     def __init__(self,
                  vocab_size=30522,
diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py
index 49cf0ee4..7493ba74 100644
--- a/modelscope/models/nlp/task_models/__init__.py
+++ b/modelscope/models/nlp/task_models/__init__.py
@@ -7,12 +7,14 @@ if TYPE_CHECKING:
     from .information_extraction import InformationExtractionModel
     from .sequence_classification import SequenceClassificationModel
     from .task_model import SingleBackboneTaskModelBase
+    from .token_classification import TokenClassificationModel
 
 else:
     _import_structure = {
         'information_extraction': ['InformationExtractionModel'],
         'sequence_classification': ['SequenceClassificationModel'],
         'task_model': ['SingleBackboneTaskModelBase'],
+        'token_classification': ['TokenClassificationModel'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py
new file mode 100644
index 00000000..29679838
--- /dev/null
+++ b/modelscope/models/nlp/task_models/token_classification.py
@@ -0,0 +1,83 @@
+from typing import Any, Dict
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+from modelscope.utils.tensor_utils import (torch_nested_detach,
+                                           torch_nested_numpify)
+
+__all__ = ['TokenClassificationModel']
+
+
+@MODELS.register_module(
+    Tasks.token_classification, module_name=TaskModels.token_classification)
+class TokenClassificationModel(SingleBackboneTaskModelBase):
+    """Token classification model built from a backbone and a head."""
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the token classification model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+            num_labels (int, optional): label count, else the mapping size.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        if 'base_model_prefix' in kwargs:
+            self._base_model_prefix = kwargs['base_model_prefix']
+
+        # Keep `id2label` defined (None) even when no label mapping is found.
+        head_cfg = self.cfg.head
+        num_labels = kwargs.get('num_labels')
+        self.id2label = getattr(self, 'id2label', None)
+        if num_labels is None:
+            label2id = parse_label_mapping(model_dir)
+            if label2id:
+                num_labels = len(label2id)
+                self.id2label = {idx: tag for tag, idx in label2id.items()}
+        head_cfg['num_labels'] = num_labels
+
+        self.build_backbone(self.cfg.backbone)
+        self.build_head(head_cfg)
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        """Run backbone and head; add the loss when labels were supplied."""
+        labels = None
+        for key in (OutputKeys.LABEL, OutputKeys.LABELS):
+            if key in input:
+                labels = input.pop(key)
+                break
+        outputs = super().forward(input)
+        sequence_output, _ = self.extract_backbone_outputs(outputs)
+        outputs = self.head.forward(sequence_output)
+        # Compare with None: `labels in input` would test the dict's keys.
+        if labels is not None:
+            outputs.update(self.compute_loss(outputs, labels))
+        return outputs
+
+    def extract_logits(self, outputs):
+        return outputs[OutputKeys.LOGITS].cpu().detach()
+
+    def extract_backbone_outputs(self, outputs):
+        # Only the sequence output is produced; the pooled slot stays None.
+        sequence_output = None
+        if hasattr(self.backbone, 'extract_sequence_outputs'):
+            sequence_output = self.backbone.extract_sequence_outputs(outputs)
+        return sequence_output, None
+
+    def compute_loss(self, outputs, labels):
+        return self.head.compute_loss(outputs, labels)
+
+    def postprocess(self, input, **kwargs):
+        """Return numpy predictions (first sample only) and numpy logits."""
+        logits = self.extract_logits(input)
+        pred = torch.argmax(logits[0], dim=-1)
+        pred = torch_nested_numpify(torch_nested_detach(pred))
+        logits = torch_nested_numpify(torch_nested_detach(logits))
+        return {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits}
diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py
index 59d7d0cf..0be921d0 100644
--- a/modelscope/models/nlp/token_classification.py
+++ b/modelscope/models/nlp/token_classification.py
@@ -91,6 +91,7 @@ class TokenClassification(TorchModel):
 
 
 @MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert)
+@MODELS.register_module(Tasks.part_of_speech, module_name=Models.structbert)
 @MODELS.register_module(
     Tasks.token_classification, module_name=Models.structbert)
 class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel):
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index c6a7a619..6c7500bb 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -359,26 +359,20 @@ TASK_OUTPUTS = {
     # word segmentation result for single sample
     # {
     #   "output": "今天 天气 不错 ， 适合 出去 游玩"
-    # }
-    Tasks.word_segmentation: [OutputKeys.OUTPUT],
-
-    # part-of-speech result for single sample
-    # [
-    #     {'word': '诸葛', 'label': 'PROPN'},
-    #     {'word': '亮', 'label': 'PROPN'},
-    #     {'word': '发明', 'label': 'VERB'},
-    #     {'word': '八', 'label': 'NUM'},
-    #     {'word': '阵', 'label': 'NOUN'},
-    #     {'word': '图', 'label': 'PART'},
-    #     {'word': '以', 'label': 'ADV'},
-    #     {'word': '利', 'label': 'VERB'},
-    #     {'word': '立营', 'label': 'VERB'},
-    #     {'word': '练兵', 'label': 'VERB'},
-    #     {'word': '.', 'label': 'PUNCT'}
+    #   "labels": [
+    #     {'word': '今天', 'label': 'PROPN'},
+    #     {'word': '天气', 'label': 'PROPN'},
+    #     {'word': '不错', 'label': 'VERB'},
+    #     {'word': ',', 'label': 'NUM'},
+    #     {'word': '适合', 'label': 'NOUN'},
+    #     {'word': '出去', 'label': 'PART'},
+    #     {'word': '游玩', 'label': 'ADV'},
     # ]
-    # TODO @wenmeng.zwm support list of result check
-    Tasks.part_of_speech: [OutputKeys.WORD, OutputKeys.LABEL],
+    # }
+    Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS],
+    Tasks.part_of_speech: [OutputKeys.OUTPUT, OutputKeys.LABELS],
 
+    # TODO @wenmeng.zwm support list of result check
     # named entity recognition result for single sample
     # {
     #   "output": [
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 9f265fb8..fa79ca11 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -20,6 +20,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.word_segmentation:
     (Pipelines.word_segmentation,
      'damo/nlp_structbert_word-segmentation_chinese-base'),
+    Tasks.token_classification:
+    (Pipelines.part_of_speech,
+     'damo/nlp_structbert_part-of-speech_chinese-base'),
     Tasks.named_entity_recognition:
     (Pipelines.named_entity_recognition,
      'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 665e016d..9baeefbb 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -9,21 +9,21 @@ if TYPE_CHECKING:
     from .dialog_modeling_pipeline import DialogModelingPipeline
     from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline
     from .document_segmentation_pipeline import DocumentSegmentationPipeline
+    from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline
     from .fill_mask_pipeline import FillMaskPipeline
     from .information_extraction_pipeline import InformationExtractionPipeline
     from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline
     from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline
     from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline
     from .sequence_classification_pipeline import SequenceClassificationPipeline
-    from .text_generation_pipeline import TextGenerationPipeline
-    from .translation_pipeline import TranslationPipeline
-    from .word_segmentation_pipeline import WordSegmentationPipeline
-    from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
     from .summarization_pipeline import SummarizationPipeline
     from .text_classification_pipeline import TextClassificationPipeline
     from .text_error_correction_pipeline import TextErrorCorrectionPipeline
-    from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline
-    from .relation_extraction_pipeline import RelationExtractionPipeline
+    from .text_generation_pipeline import TextGenerationPipeline
+    from .token_classification_pipeline import TokenClassificationPipeline
+    from .translation_pipeline import TranslationPipeline
+    from .word_segmentation_pipeline import WordSegmentationPipeline
+    from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
 
 else:
     _import_structure = {
@@ -34,25 +34,25 @@ else:
         'dialog_modeling_pipeline': ['DialogModelingPipeline'],
         'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'],
         'document_segmentation_pipeline': ['DocumentSegmentationPipeline'],
+        'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'],
         'fill_mask_pipeline': ['FillMaskPipeline'],
+        'named_entity_recognition_pipeline':
+        ['NamedEntityRecognitionPipeline'],
         'information_extraction_pipeline': ['InformationExtractionPipeline'],
-        'single_sentence_classification_pipeline':
-        ['SingleSentenceClassificationPipeline'],
         'pair_sentence_classification_pipeline':
         ['PairSentenceClassificationPipeline'],
         'sequence_classification_pipeline': ['SequenceClassificationPipeline'],
-        'text_generation_pipeline': ['TextGenerationPipeline'],
-        'word_segmentation_pipeline': ['WordSegmentationPipeline'],
-        'zero_shot_classification_pipeline':
-        ['ZeroShotClassificationPipeline'],
-        'named_entity_recognition_pipeline':
-        ['NamedEntityRecognitionPipeline'],
-        'translation_pipeline': ['TranslationPipeline'],
+        'single_sentence_classification_pipeline':
+        ['SingleSentenceClassificationPipeline'],
         'summarization_pipeline': ['SummarizationPipeline'],
         'text_classification_pipeline': ['TextClassificationPipeline'],
         'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'],
-        'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'],
-        'relation_extraction_pipeline': ['RelationExtractionPipeline']
+        'text_generation_pipeline': ['TextGenerationPipeline'],
+        'token_classification_pipeline': ['TokenClassificationPipeline'],
+        'translation_pipeline': ['TranslationPipeline'],
+        'word_segmentation_pipeline': ['WordSegmentationPipeline'],
+        'zero_shot_classification_pipeline':
+        ['ZeroShotClassificationPipeline'],
     }
 
     import sys
diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py
new file mode 100644
index 00000000..804f8146
--- /dev/null
+++ b/modelscope/pipelines/nlp/token_classification_pipeline.py
@@ -0,0 +1,92 @@
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import (Preprocessor,
+                                      TokenClassificationPreprocessor)
+from modelscope.utils.constant import Tasks
+
+__all__ = ['TokenClassificationPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.token_classification, module_name=Pipelines.part_of_speech)
+class TokenClassificationPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+        """use `model` and `preprocessor` to create a token classification pipeline for prediction
+
+        Args:
+            model (str or Model): A model instance or a model local dir or a model id in the model hub.
+            preprocessor (Preprocessor): a preprocessor instance; when None, a
+                TokenClassificationPreprocessor is built from the model dir.
+        """
+        assert isinstance(model, str) or isinstance(model, Model), \
+            'model must be a single str or Model'
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = TokenClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        # Default to None so a model without the mapping reaches the assertion
+        # below instead of failing with an AttributeError inside getattr.
+        self.id2label = getattr(model, 'id2label', None)
+        assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \
+                                          'as a parameter or make sure the preprocessor has the attribute.'
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        text = inputs.pop(OutputKeys.TEXT)
+        with torch.no_grad():
+            return {
+                **self.model(inputs, **forward_params), OutputKeys.TEXT: text
+            }
+
+    def postprocess(self, inputs: Dict[str, Any],
+                    **postprocess_params) -> Dict[str, str]:
+        """process the prediction results
+
+        Args:
+            inputs (Dict[str, Any]): model outputs holding 'predictions' (label
+                ids; the first and last entries are dropped) and 'text' (the
+                sequence that is zipped with the remaining labels).
+
+        Returns:
+            Dict[str, str]: the space-joined words under `output` and a list
+                of word/label dicts under `labels`.
+        """
+        # Drop the first and last predictions (presumably [CLS]/[SEP]).
+        labels = [self.id2label[pred] for pred in inputs['predictions']][1:-1]
+        assert len(inputs['text']) == len(labels)
+        chunks = []
+        tags = []
+        chunk = ''
+        for token, label in zip(inputs['text'], labels):
+            chunk += token
+            # 'B'/'I' labels keep the word open; any other label closes it.
+            if label[0] not in ('B', 'I'):
+                chunks.append(chunk)
+                chunk = ''
+                tags.append(label.split('-')[-1])
+        if chunk:
+            chunks.append(chunk)
+            tags.append(label.split('-')[-1])
+        pos_result = [{
+            OutputKeys.WORD: word,
+            OutputKeys.LABEL: tag
+        } for word, tag in zip(chunks, tags)]
+        return {
+            OutputKeys.OUTPUT: ' '.join(chunks),
+            OutputKeys.LABELS: pos_result
+        }
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 9f7d595e..0123b32e 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -15,15 +15,14 @@ if TYPE_CHECKING:
                         ImageDenoisePreprocessor)
     from .kws import WavToLists
     from .multi_modal import (OfaPreprocessor, MPlugPreprocessor)
-    from .nlp import (Tokenize, SequenceClassificationPreprocessor,
-                      TextGenerationPreprocessor,
-                      TokenClassificationPreprocessor,
-                      SingleSentenceClassificationPreprocessor,
-                      PairSentenceClassificationPreprocessor,
-                      FillMaskPreprocessor, ZeroShotClassificationPreprocessor,
-                      NERPreprocessor, TextErrorCorrectionPreprocessor,
-                      FaqQuestionAnsweringPreprocessor,
-                      RelationExtractionPreprocessor)
+    from .nlp import (
+        Tokenize, SequenceClassificationPreprocessor,
+        TextGenerationPreprocessor, TokenClassificationPreprocessor,
+        SingleSentenceClassificationPreprocessor,
+        PairSentenceClassificationPreprocessor, FillMaskPreprocessor,
+        ZeroShotClassificationPreprocessor, NERPreprocessor,
+        TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
+        SequenceLabelingPreprocessor, RelationExtractionPreprocessor)
     from .slp import DocumentSegmentationPreprocessor
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
@@ -52,7 +51,7 @@ else:
             'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
             'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
             'TextErrorCorrectionPreprocessor',
-            'FaqQuestionAnsweringPreprocessor',
+            'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
             'RelationExtractionPreprocessor'
         ],
         'slp': ['DocumentSegmentationPreprocessor'],
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index cfb8c9e8..aaa83ed1 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -5,9 +5,11 @@ import uuid
 from typing import Any, Dict, Iterable, Optional, Tuple, Union
 
 import numpy as np
+import torch
 from transformers import AutoTokenizer, BertTokenizerFast
 
 from modelscope.metainfo import Models, Preprocessors
+from modelscope.models.nlp.structbert import SbertTokenizerFast
 from modelscope.outputs import OutputKeys
 from modelscope.utils.config import ConfigFields
 from modelscope.utils.constant import Fields, InputFields, ModeKeys
@@ -23,7 +25,7 @@ __all__ = [
     'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
     'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
     'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor',
-    'RelationExtractionPreprocessor'
+    'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor'
 ]
 
 
@@ -627,6 +629,112 @@ class NERPreprocessor(Preprocessor):
         }
 
 
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer)
+class SequenceLabelingPreprocessor(Preprocessor):
+    """The tokenizer preprocessor used in normal NER task.
+
+    NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition.
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            model_dir (str): model path
+        """
+        super().__init__(*args, **kwargs)
+        self.model_dir: str = model_dir
+        self.sequence_length = kwargs.pop('sequence_length', 512)
+
+        if 'lstm' in model_dir or 'gcnn' in model_dir:
+            self.tokenizer = BertTokenizerFast.from_pretrained(
+                model_dir, use_fast=False)
+        elif 'structbert' in model_dir:
+            self.tokenizer = SbertTokenizerFast.from_pretrained(
+                model_dir, use_fast=False)
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_dir, use_fast=False)
+        self.is_split_into_words = self.tokenizer.init_kwargs.get(
+            'is_split_into_words', False)
+        # __call__ strips the first/last position when this flag is False.
+        # NOTE(review): deriving it from the path is an assumption; confirm.
+        self.is_transformer_based_model = 'lstm' not in model_dir
+
+    @type_assert(object, str)
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (str): a sentence
+                Example:
+                    'you are so handsome.'
+
+        Returns:
+            Dict[str, Any]: the preprocessed data
+        """
+
+        # preprocess the data for the model input
+        if self.is_split_into_words:
+            input_ids = []
+            label_mask = []
+            offset_mapping = []
+            for offset, token in enumerate(list(data)):
+                subtoken_ids = self.tokenizer.encode(
+                    token, add_special_tokens=False)
+                if len(subtoken_ids) == 0:
+                    subtoken_ids = [self.tokenizer.unk_token_id]
+                input_ids.extend(subtoken_ids)
+                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
+                offset_mapping.extend([(offset, offset + 1)]
+                                      + [(offset + 1, offset + 1)]
+                                      * (len(subtoken_ids) - 1))
+            if len(input_ids) >= self.sequence_length - 2:
+                input_ids = input_ids[:self.sequence_length - 2]
+                label_mask = label_mask[:self.sequence_length - 2]
+                offset_mapping = offset_mapping[:self.sequence_length - 2]
+            input_ids = [self.tokenizer.cls_token_id
+                         ] + input_ids + [self.tokenizer.sep_token_id]
+            label_mask = [0] + label_mask + [0]
+            attention_mask = [1] * len(input_ids)
+        else:
+            encodings = self.tokenizer(
+                data,
+                add_special_tokens=True,
+                padding=True,
+                truncation=True,
+                max_length=self.sequence_length,
+                return_offsets_mapping=True)
+            input_ids = encodings['input_ids']
+            attention_mask = encodings['attention_mask']
+            word_ids = encodings.word_ids()
+            label_mask = []
+            offset_mapping = []
+            for i in range(len(word_ids)):
+                if word_ids[i] is None:
+                    label_mask.append(0)
+                elif word_ids[i] == word_ids[i - 1]:
+                    label_mask.append(0)
+                    offset_mapping[-1] = (offset_mapping[-1][0],
+                                          encodings['offset_mapping'][i][1])
+                else:
+                    label_mask.append(1)
+                    offset_mapping.append(encodings['offset_mapping'][i])
+
+        if not self.is_transformer_based_model:
+            input_ids = input_ids[1:-1]
+            attention_mask = attention_mask[1:-1]
+            label_mask = label_mask[1:-1]
+        return {
+            'text': data,
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'label_mask': label_mask,
+            'offset_mapping': offset_mapping
+        }
+
+
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.re_tokenizer)
 class RelationExtractionPreprocessor(Preprocessor):
diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py
index f79097fe..cf114b5e 100644
--- a/modelscope/utils/hub.py
+++ b/modelscope/utils/hub.py
@@ -77,19 +77,26 @@ def auto_load(model: Union[str, List[str]]):
 def get_model_type(model_dir):
     """Get the model type from the configuration.
 
-    This method will try to get the 'model.type' or 'model.model_type' field from the configuration.json file.
-    If this file does not exist, the method will try to get the 'model_type' field from the config.json.
+    This method will try to get the model type from 'model.backbone.type',
+    'model.type' or 'model.model_type' field in the configuration.json file. If
+    this file does not exist, the method will try to get the 'model_type' field
+    from the config.json.
 
-    @param model_dir: The local model dir to use.
-    @return: The model type string, returns None if nothing is found.
+    @param model_dir: The local model dir to use.
+    @return: The model type string, returns None if nothing is found.
     """
     try:
         configuration_file = osp.join(model_dir, ModelFile.CONFIGURATION)
         config_file = osp.join(model_dir, 'config.json')
         if osp.isfile(configuration_file):
             cfg = Config.from_file(configuration_file)
-            return cfg.model.model_type if hasattr(cfg.model, 'model_type') and not hasattr(cfg.model, 'type') \
-                else cfg.model.type
+            if hasattr(cfg.model, 'backbone'):
+                return cfg.model.backbone.type
+            elif hasattr(cfg.model,
+                         'model_type') and not hasattr(cfg.model, 'type'):
+                return cfg.model.model_type
+            else:
+                return cfg.model.type
         elif osp.isfile(config_file):
             cfg = Config.from_file(config_file)
             return cfg.model_type if hasattr(cfg, 'model_type') else None
@@ -123,13 +130,24 @@ def parse_label_mapping(model_dir):
         if hasattr(config, ConfigFields.model) and hasattr(
                 config[ConfigFields.model], 'label2id'):
             label2id = config[ConfigFields.model].label2id
+        elif hasattr(config, ConfigFields.model) and hasattr(
+                config[ConfigFields.model], 'id2label'):
+            id2label = config[ConfigFields.model].id2label
+            label2id = {label: id for id, label in id2label.items()}
         elif hasattr(config, ConfigFields.preprocessor) and hasattr(
                 config[ConfigFields.preprocessor], 'label2id'):
             label2id = config[ConfigFields.preprocessor].label2id
+        elif hasattr(config, ConfigFields.preprocessor) and hasattr(
+                config[ConfigFields.preprocessor], 'id2label'):
+            id2label = config[ConfigFields.preprocessor].id2label
+            label2id = {label: id for id, label in id2label.items()}
 
     if label2id is None:
         config_path = os.path.join(model_dir, 'config.json')
         config = Config.from_file(config_path)
         if hasattr(config, 'label2id'):
             label2id = config.label2id
+        elif hasattr(config, 'id2label'):
+            id2label = config.id2label
+            label2id = {label: id for id, label in id2label.items()}
     return label2id
diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py
new file mode 100644
index 00000000..25f4491c
--- /dev/null
+++ b/tests/pipelines/test_part_of_speech.py
@@ -0,0 +1,55 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import TokenClassificationModel
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import TokenClassificationPipeline
+from modelscope.preprocessors import TokenClassificationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class PartOfSpeechTest(unittest.TestCase):
+    """Smoke tests for the part-of-speech token classification pipeline."""
+    model_id = 'damo/nlp_structbert_part-of-speech_chinese-base'
+    sentence = '今天天气不错，适合出去游玩'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        model_dir = snapshot_download(self.model_id)
+        preprocessor = TokenClassificationPreprocessor(model_dir)
+        model = TokenClassificationModel.from_pretrained(model_dir)
+        direct = TokenClassificationPipeline(model, preprocessor=preprocessor)
+        built = pipeline(
+            Tasks.token_classification, model=model, preprocessor=preprocessor)
+        print(f'sentence: {self.sentence}\n'
+              f'pipeline1:{direct(input=self.sentence)}')
+        print()
+        print(f'pipeline2: {built(input=self.sentence)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        preprocessor = TokenClassificationPreprocessor(model.model_dir)
+        pos_pipeline = pipeline(
+            task=Tasks.token_classification,
+            model=model,
+            preprocessor=preprocessor)
+        print(pos_pipeline(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pos_pipeline = pipeline(
+            task=Tasks.token_classification, model=self.model_id)
+        print(pos_pipeline(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        print(pipeline(task=Tasks.token_classification)(input=self.sentence))
+
+
+if __name__ == '__main__':
+    unittest.main()

From a8fd9c4afea7fbe0a8d478587911702f0d470a53 Mon Sep 17 00:00:00 2001
From: "shichen.fsc" <shichen.fsc@alibaba-inc.com>
Date: Wed, 7 Sep 2022 20:12:44 +0800
Subject: [PATCH 076/175] [to #42322933] add new pipeline - PoNet for fill-mask
         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10019083

---
 modelscope/metainfo.py                        |    3 +
 modelscope/models/nlp/__init__.py             |    2 +
 modelscope/models/nlp/ponet/__init__.py       |   41 +
 .../models/nlp/ponet/configuration_ponet.py   |  117 ++
 modelscope/models/nlp/ponet/modeling_ponet.py | 1591 +++++++++++++++++
 .../models/nlp/ponet/tokenization_ponet.py    |  155 ++
 .../models/nlp/ponet_for_masked_language.py   |   53 +
 modelscope/pipelines/nlp/__init__.py          |    2 +
 .../pipelines/nlp/fill_mask_ponet_pipeline.py |  136 ++
 modelscope/preprocessors/__init__.py          |    8 +-
 modelscope/preprocessors/nlp.py               |  306 +++-
 modelscope/preprocessors/slp.py               |  223 ---
 modelscope/utils/nlp/nlp_utils.py             |   20 +
 tests/pipelines/test_fill_mask_ponet.py       |   48 +
 14 files changed, 2475 insertions(+), 230 deletions(-)
 create mode 100644 modelscope/models/nlp/ponet/__init__.py
 create mode 100644 modelscope/models/nlp/ponet/configuration_ponet.py
 create mode 100644 modelscope/models/nlp/ponet/modeling_ponet.py
 create mode 100644 modelscope/models/nlp/ponet/tokenization_ponet.py
 create mode 100644 modelscope/models/nlp/ponet_for_masked_language.py
 create mode 100644 modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py
 delete mode 100644 modelscope/preprocessors/slp.py
 create mode 100644 tests/pipelines/test_fill_mask_ponet.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 994095c3..f904b5df 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -62,6 +62,7 @@ class Models(object):
     gpt3 = 'gpt3'
     plug = 'plug'
     bert_for_ds = 'bert-for-document-segmentation'
+    ponet = 'ponet'
 
     # audio models
     sambert_hifigan = 'sambert-hifigan'
@@ -179,6 +180,7 @@ class Pipelines(object):
     sentiment_classification = 'sentiment-classification'
     text_classification = 'text-classification'
     fill_mask = 'fill-mask'
+    fill_mask_ponet = 'fill-mask-ponet'
     csanmt_translation = 'csanmt-translation'
     nli = 'nli'
     dialog_intent_prediction = 'dialog-intent-prediction'
@@ -281,6 +283,7 @@ class Preprocessors(object):
     sequence_labeling_tokenizer = 'sequence-labeling-tokenizer'
     word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor'
     fill_mask = 'fill-mask'
+    fill_mask_ponet = 'fill-mask-ponet'
     faq_question_answering_preprocessor = 'faq-question-answering-preprocessor'
     conversational_text_to_sql = 'conversational-text-to-sql'
     re_tokenizer = 're-tokenizer'
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 40be8665..a3a12c22 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -13,6 +13,7 @@ if TYPE_CHECKING:
     from .gpt3 import GPT3ForTextGeneration
     from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM,
                                   BertForMaskedLM, DebertaV2ForMaskedLM)
+    from .ponet_for_masked_language import PoNetForMaskedLM
     from .nncrf_for_named_entity_recognition import (
         TransformerCRFForNamedEntityRecognition,
         LSTMCRFForNamedEntityRecognition)
@@ -46,6 +47,7 @@ else:
             'TransformerCRFForNamedEntityRecognition',
             'LSTMCRFForNamedEntityRecognition'
         ],
+        'ponet_for_masked_language': ['PoNetForMaskedLM'],
         'palm_v2': ['PalmForTextGeneration'],
         'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'],
         'star_text_to_sql': ['StarForTextToSql'],
diff --git a/modelscope/models/nlp/ponet/__init__.py b/modelscope/models/nlp/ponet/__init__.py
new file mode 100644
index 00000000..6d26b194
--- /dev/null
+++ b/modelscope/models/nlp/ponet/__init__.py
@@ -0,0 +1,41 @@
+# Copyright 2021-2022 The Alibaba DAMO Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # eager imports below are only executed for static type checkers
+    from .configuration_ponet import PoNetConfig
+    from .modeling_ponet import (PoNetForMaskedLM, PoNetModel,
+                                 PoNetPreTrainedModel)
+    from .tokenization_ponet import PoNetTokenizer
+else:
+    _import_structure = {
+        'configuration_ponet': ['PoNetConfig'],
+        'modeling_ponet':
+        ['PoNetForMaskedLM', 'PoNetModel', 'PoNetPreTrainedModel'],
+        'tokenization_ponet': ['PoNetTokenizer'],
+    }
+
+    import sys
+    # Swap this package module for a LazyImportModule built from _import_structure.
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/ponet/configuration_ponet.py b/modelscope/models/nlp/ponet/configuration_ponet.py
new file mode 100644
index 00000000..70294fc2
--- /dev/null
+++ b/modelscope/models/nlp/ponet/configuration_ponet.py
@@ -0,0 +1,117 @@
+# Copyright 2021-2022 The Alibaba DAMO Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PoNet model configuration, mainly copied from :class:`~transformers.BertConfig` """
+from transformers import PretrainedConfig
+
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class PoNetConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~modelscope.models.nlp.ponet.PoNetModel`.
+    It is used to instantiate a PoNet model according to the specified arguments.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+            Vocabulary size of the PoNet model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling
+            :class:`~modelscope.models.nlp.ponet.PoNetModel`.
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed when calling
+            :class:`~modelscope.models.nlp.ponet.PoNetModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        pad_token_id (:obj:`int`, `optional`, defaults to 0):
+            Id of the padding token; it is forwarded to :class:`~transformers.PretrainedConfig`.
+        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
+            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
+            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
+            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
+            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
+            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
+            <https://arxiv.org/abs/2009.13658>`__.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``config.is_decoder=True``.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
+        clsgsepg (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not to use a trick to make sure the segment and local information will not leak.
+    """
+    model_type = 'ponet'
+
+    def __init__(self,
+                 vocab_size=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 pad_token_id=0,
+                 position_embedding_type='absolute',
+                 use_cache=True,
+                 classifier_dropout=None,
+                 clsgsepg=True,
+                 **kwargs):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+        self.clsgsepg = clsgsepg
diff --git a/modelscope/models/nlp/ponet/modeling_ponet.py b/modelscope/models/nlp/ponet/modeling_ponet.py
new file mode 100644
index 00000000..f37954db
--- /dev/null
+++ b/modelscope/models/nlp/ponet/modeling_ponet.py
@@ -0,0 +1,1591 @@
+# Copyright 2021-2022 The Alibaba DAMO Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch PoNet model. """
+
+import math
+from dataclasses import dataclass
+from distutils.version import LooseVersion
+from typing import Optional, Tuple
+
+import torch
+import torch.utils.checkpoint
+from packaging import version
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.file_utils import (ModelOutput, add_code_sample_docstrings,
+                                     add_start_docstrings,
+                                     add_start_docstrings_to_model_forward,
+                                     replace_return_docstrings)
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions, MaskedLMOutput,
+    SequenceClassifierOutput, TokenClassifierOutput)
+from transformers.modeling_utils import (PreTrainedModel,
+                                         apply_chunking_to_forward,
+                                         find_pruneable_heads_and_indices,
+                                         prune_linear_layer)
+from transformers.models.bert.modeling_bert import \
+    load_tf_weights_in_bert as load_tf_weights_in_ponet
+
+from modelscope.utils.logger import get_logger
+from .configuration_ponet import PoNetConfig
+
+logger = get_logger(__name__)
+# segment_max() takes its scatter_reduce branch only when this flag is True.
+is_pytorch_12plus = LooseVersion(torch.__version__) >= LooseVersion('1.12.0')
+
+_CHECKPOINT_FOR_DOC = 'ponet-base-uncased'
+_CONFIG_FOR_DOC = 'PoNetConfig'
+_TOKENIZER_FOR_DOC = 'PoNetTokenizer'
+
+CLS_ID = 101  # default cls_id of get_segment_index() / get_token_type_mask()
+EOS_ID = 102  # default eos_id of get_segment_index() / get_token_type_mask()
+
+
+def segment_max(src, index, dim=1):
+    """Max of ``src`` inside each ``index`` segment, gathered back per token.
+
+    NOTE(review): assumes ``src`` is 3-D and ``index`` 2-D with segments laid
+    out along dim 1 (the indexing below hard-codes it) -- confirm with callers.
+    """
+    index_3d = index[:, :, None].expand_as(src)
+    if is_pytorch_12plus:
+        # Native path: scatter with an ``amax`` reduction into a zero buffer.
+        out = torch.zeros_like(src).scatter_reduce(
+            dim, index_3d, src, reduce='amax', include_self=False)
+    else:
+        # Fallback: add a segment axis, overwrite entries of other segments
+        # with a value below the global minimum, then take the max over dim 1.
+        num_segments = index.max() + 1
+        wide_shape = (*src.shape[:-1], num_segments, src.shape[-1])
+        token_segment = index_3d.unsqueeze(-2).expand(*wide_shape)
+        segment_ids = torch.arange(num_segments, device=src.device)
+        segment_ids = segment_ids[None, None, :, None].expand(*wide_shape)
+        src_wide = src.unsqueeze(-2).expand(*wide_shape)
+        filler = torch.full_like(src_wide, (src.min() - 1).item())
+        out, _ = src_wide.masked_scatter(token_segment != segment_ids,
+                                         filler).max(dim=1)
+
+    gather_index = index.unsqueeze(-1).expand(*index.shape[:2], out.size(-1))
+    return torch.gather(out, dim, gather_index).to(dtype=src.dtype)
+
+
+def get_segment_index(input_ids, cls_id=CLS_ID, eos_id=EOS_ID):
+    """Running segment id per token, bumped at each cls/eos and right after."""
+    special = (input_ids == cls_id).long() + (input_ids == eos_id).long()
+    after_special = torch.cat(
+        [torch.zeros_like(special[:, :1]), special[:, :-1]], dim=1)
+    return (special + after_special).cumsum(dim=1) - 1
+
+
+def get_token_type_mask(input_ids, cls_id=CLS_ID, eos_id=EOS_ID):
+    """True where ``input_ids`` equals ``cls_id`` or ``eos_id``."""
+    return (input_ids == cls_id) | (input_ids == eos_id)
+
+
+def get_win_max(hidden_states, kernel_size=3):
+    """Stride-1 sliding-window max along dim 1 of a 3-D tensor."""
+    pool = nn.MaxPool1d(kernel_size, stride=1, padding=kernel_size // 2)
+    return pool(hidden_states.permute(0, 2, 1)).permute(0, 2, 1)
+
+
+class PoNetEmbeddings(nn.Module):
+    """Sum of word, token-type and (absolute) position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden = config.hidden_size
+        self.word_embeddings = nn.Embedding(
+            config.vocab_size, hidden, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, hidden)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  hidden)
+
+        # The attribute keeps the TensorFlow-style name ``LayerNorm`` so that
+        # TensorFlow checkpoint files can still be loaded.
+        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.position_embedding_type = getattr(config,
+                                               'position_embedding_type',
+                                               'absolute')
+        # position_ids (1, len position emb) is contiguous in memory and
+        # exported when serialized
+        position_ids = torch.arange(config.max_position_embeddings)
+        self.register_buffer('position_ids', position_ids.expand((1, -1)))
+        if version.parse(torch.__version__) > version.parse('1.6.0'):
+            # All-zero token types, registered as a non-persistent buffer.
+            self.register_buffer(
+                'token_type_ids',
+                torch.zeros(
+                    self.position_ids.size(),
+                    dtype=torch.long,
+                    device=self.position_ids.device),
+                persistent=False,
+            )
+
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                inputs_embeds=None,
+                past_key_values_length=0):
+        """Embed the inputs and apply LayerNorm and dropout.
+
+        ``inputs_embeds`` replaces the word-embedding lookup when given;
+        missing ``position_ids`` / ``token_type_ids`` fall back to the
+        registered buffers (or zeros).
+        """
+        if input_ids is None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            input_shape = input_ids.size()
+        batch_size, seq_length = input_shape[0], input_shape[1]
+
+        if position_ids is None:
+            start = past_key_values_length
+            position_ids = self.position_ids[:, start:start + seq_length]
+
+        if token_type_ids is None:
+            if hasattr(self, 'token_type_ids'):
+                # Slice the registered all-zero buffer instead of allocating.
+                token_type_ids = self.token_type_ids[:, :seq_length].expand(
+                    batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(
+                    input_shape,
+                    dtype=torch.long,
+                    device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
+        # Other position_embedding_type values add nothing at this stage.
+        if self.position_embedding_type == 'absolute':
+            embeddings += self.position_embeddings(position_ids)
+        embeddings = self.LayerNorm(embeddings)
+        return self.dropout(embeddings)
+
+
+class PoNetSelfAttention(nn.Module):
+    """Token mixing from a global pooled vector, segment max and window max."""
+    def __init__(self, config):
+        super().__init__()
+
+        self.dense_local = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dense_segment = nn.Linear(config.hidden_size, config.hidden_size)
+
+        self.num_attention_heads = config.num_attention_heads
+        self.clsgsepg = getattr(config, 'clsgsepg', True)
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.dense_q = nn.Linear(config.hidden_size, self.all_head_size)
+        self.dense_k = nn.Linear(config.hidden_size, self.all_head_size)
+        self.dense_o = nn.Linear(config.hidden_size, self.all_head_size)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)  # bz, head, len, head_size
+
+    def forward(
+        self,
+        hidden_states,
+        segment_index,
+        token_type_mask,
+        attention_mask=None,
+        head_mask=None,  # not read in this method
+        encoder_hidden_states=None,  # not read in this method
+        encoder_attention_mask=None,  # not read in this method
+        past_key_value=None,  # not read in this method
+        output_attentions=False,
+    ):
+        """Return ``(context_layer,)``, plus ``att_prob`` if ``output_attentions``."""
+        context_layer_q = self.transpose_for_scores(
+            self.dense_q(hidden_states))
+        context_layer_k = self.transpose_for_scores(
+            self.dense_k(hidden_states))
+        context_layer_v = context_layer_k  # values reuse the key projection
+        context_layer_o = self.transpose_for_scores(
+            self.dense_o(hidden_states))
+
+        if attention_mask is not None:  # boolean mask: True where the additive mask is < -1
+            _attention_mask = (attention_mask.squeeze(1).unsqueeze(-1) < -1)
+
+        if attention_mask is not None:  # global query = mean of q over unmasked positions
+            context_layer_q.masked_fill_(_attention_mask, 0.0)
+            q = context_layer_q.sum(dim=-2) / torch.ones_like(
+                _attention_mask).to(dtype=context_layer_q.dtype).masked_fill(
+                    _attention_mask, 0.0).sum(dim=-2)
+        else:
+            q = context_layer_q.mean(dim=-2)
+        att = torch.einsum('bdh,bdlh -> bdl', q, context_layer_k) / math.sqrt(
+            context_layer_q.shape[-1])
+        if attention_mask is not None:
+            att = att + attention_mask.squeeze(1)
+        att_prob = att.softmax(dim=-1)
+        v = torch.einsum('bdlh,bdl->bdh', context_layer_v, att_prob)  # one pooled vector per head
+
+        context_layer_segment = self.dense_segment(hidden_states)
+        context_layer_local = self.dense_local(hidden_states)
+        if attention_mask is not None:  # masked positions get -10000 before the max pools
+            context_layer_local.masked_fill_(
+                _attention_mask.squeeze(1), -10000)
+            context_layer_segment.masked_fill_(
+                _attention_mask.squeeze(1), -10000)
+
+        if self.clsgsepg:
+            # XXX: a trick to make sure the segment and local information will not leak
+            context_layer_local = get_win_max(
+                context_layer_local.masked_fill(
+                    token_type_mask.unsqueeze(dim=-1), -10000))
+            context_layer_segment = segment_max(
+                context_layer_segment, index=segment_index)
+
+            context_layer_segment.masked_fill_(
+                token_type_mask.unsqueeze(dim=-1), 0.0)
+            context_layer_local.masked_fill_(
+                token_type_mask.unsqueeze(dim=-1), 0.0)
+        else:
+            context_layer_local = get_win_max(context_layer_local)
+            context_layer_segment = segment_max(
+                context_layer_segment, index=segment_index)
+
+        context_layer_local = self.transpose_for_scores(context_layer_local)
+        context_layer_segment = self.transpose_for_scores(
+            context_layer_segment)
+
+        context_layer = (v.unsqueeze(dim=-2) + context_layer_segment
+                         ) * context_layer_o + context_layer_local
+        context_layer = context_layer.permute(0, 2, 1, 3).reshape(
+            *hidden_states.shape[:2], -1)
+
+        if attention_mask is not None:  # zero the output at masked positions
+            context_layer.masked_fill_(_attention_mask.squeeze(1), 0.0)
+
+        outputs = (context_layer,
+                   att_prob) if output_attentions else (context_layer, )
+        return outputs
+
+
+class PoNetSelfOutput(nn.Module):
+    """Dense projection, dropout and residual LayerNorm after self-attention."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Project ``hidden_states`` and normalize its sum with ``input_tensor``."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class PoNetIntermediate(nn.Module):
+    """Feed-forward expansion: linear layer followed by the activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        # ``hidden_act`` is either an ACT2FN key or already a callable.
+        act = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[act] if isinstance(act, str) else act
+
+    def forward(self, hidden_states):
+        """Apply the linear layer and then the activation function."""
+        expanded = self.dense(hidden_states)
+        return self.intermediate_act_fn(expanded)
+
+
+class PoNetOutput(nn.Module):
+    """Feed-forward contraction with dropout and a residual LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(config.intermediate_size, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Project ``hidden_states`` and normalize its sum with ``input_tensor``."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class PoNetAttention(nn.Module):
+    """PoNet self-attention followed by its residual output projection."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.self = PoNetSelfAttention(config)
+        self.output = PoNetSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """Remove the given heads from every per-head projection layer."""
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads,
+            self.self.attention_head_size, self.pruned_heads)
+        # PoNetSelfAttention has no query/key/value modules; its per-head
+        # projections are the dense_* layers, which are all pruned together.
+        for name in ('dense_q', 'dense_k', 'dense_o', 'dense_local',
+                     'dense_segment'):
+            layer = getattr(self.self, name)
+            setattr(self.self, name, prune_linear_layer(layer, index))
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads -= len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        segment_index,
+        token_type_mask,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Return ``(attention_output, *extras)`` from the wrapped layers."""
+        self_outputs = self.self(
+            hidden_states,
+            segment_index,
+            token_type_mask,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        return (attention_output, ) + self_outputs[1:]  # plus attentions, if any
+
+
+class PoNetLayer(nn.Module):
+    """One PoNet block: PoNetAttention followed by a chunked feed-forward."""
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = PoNetAttention(config)
+        # NOTE(review): the next line overwrites is_decoder on the caller's config object.
+        config.is_decoder = False  # XXX: Decoder is not yet implemented.
+        self.is_decoder = config.is_decoder
+
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            assert self.is_decoder, f'{self} should be used as a decoder model if cross attention is added'
+            self.crossattention = PoNetAttention(config)
+        self.intermediate = PoNetIntermediate(config)
+        self.output = PoNetOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        segment_index,
+        token_type_mask,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:
+                                                  2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            segment_index,
+            token_type_mask,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+        # NOTE(review): __init__ forces is_decoder to False, so the decoder branches below never run.
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[
+                1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            assert hasattr(
+                self, 'crossattention'
+            ), f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`'  # noqa *
+
+            cross_attn_past_key_value = past_key_value[
+                -2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(  # NOTE(review): segment_index/token_type_mask are not passed
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[
+                1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        layer_output = apply_chunking_to_forward(self.feed_forward_chunk,
+                                                 self.chunk_size_feed_forward,
+                                                 self.seq_len_dim,
+                                                 attention_output)
+        outputs = (layer_output, ) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value, )
+
+        return outputs  # (layer_output, *attention outputs)
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class PoNetEncoder(nn.Module):
+    """Stack of ``config.num_hidden_layers`` PoNetLayer modules applied in order."""
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [PoNetLayer(config) for _ in range(config.num_hidden_layers)])
+
+    def forward(
+        self,
+        hidden_states,
+        segment_index,
+        token_type_mask,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = (
+        ) if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states, )
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[
+                i] if past_key_values is not None else None
+
+            if getattr(self.config, 'gradient_checkpointing',
+                       False) and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        '`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting '
+                        '`use_cache=False`...')
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    # custom_forward appends past_key_value and output_attentions to the checkpointed inputs.
+                    def custom_forward(*inputs):
+                        return module(*inputs, past_key_value,
+                                      output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    segment_index,
+                    token_type_mask,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    segment_index,
+                    token_type_mask,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1], )  # NOTE(review): PoNetLayer returns no key/value cache
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (
+                    layer_outputs[1], )
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (
+                        layer_outputs[2], )
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states, )
+
+        if not return_dict:
+            return tuple(v for v in [
+                hidden_states,
+                next_decoder_cache,
+                all_hidden_states,
+                all_self_attentions,
+                all_cross_attentions,
+            ] if v is not None)
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class PoNetPooler(nn.Module):
+    """Summarise a sequence by projecting its first-token hidden state."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # Position 0 along dim 1 is taken as the pooled representation; it
+        # goes through the dense projection and then the tanh activation.
+        first_token = hidden_states[:, 0]
+        projected = self.dense(first_token)
+        return self.activation(projected)
+
+
+class PoNetPredictionHeadTransform(nn.Module):
+    """Dense projection, activation and LayerNorm applied before a head."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        # `hidden_act` is either a key of ACT2FN or already a callable.
+        act = config.hidden_act
+        self.transform_act_fn = ACT2FN[act] if isinstance(act, str) else act
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        # Applied strictly in this order: dense -> activation -> LayerNorm.
+        projected = self.dense(hidden_states)
+        activated = self.transform_act_fn(projected)
+        return self.LayerNorm(activated)
+
+
+class PoNetLMPredictionHead(nn.Module):
+    """Map hidden states to vocabulary scores via transform + decoder."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = PoNetPredictionHeadTransform(config)
+
+        # The decoder is built without a bias of its own; a separate
+        # output-only bias (one entry per token) is created right after it.
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Link the two variables so that the bias is correctly resized with
+        # `resize_token_embeddings`.
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        transformed = self.transform(hidden_states)
+        return self.decoder(transformed)
+
+
+class PoNetOnlyMLMHead(nn.Module):
+    """Wrapper that exposes only the language-modeling prediction head."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = PoNetLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        return self.predictions(sequence_output)
+
+
+class PoNetPreTrainingHeads(nn.Module):
+    """Token-level LM head plus a 3-way classifier on the pooled output."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = PoNetLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 3)
+
+    def forward(self, sequence_output, pooled_output):
+        token_scores = self.predictions(sequence_output)
+        return token_scores, self.seq_relationship(pooled_output)
+
+
+class PoNetPreTrainedModel(PreTrainedModel):
+    """
+    Common base of the PoNet models: declares the configuration class, the `load_tf_weights` hook, the base-model
+    prefix and the weight-initialization scheme they share.
+    """
+
+    config_class = PoNetConfig
+    load_tf_weights = load_tf_weights_in_ponet
+    base_model_prefix = 'ponet'
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def _init_weights(self, module):
+        """Initialize the parameters of a single sub-module in place."""
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+            return
+        if not isinstance(module, (nn.Linear, nn.Embedding)):
+            return
+        # Linear and Embedding weights get the same normal initialization
+        # (the TF version uses truncated_normal instead,
+        # cf https://github.com/pytorch/pytorch/pull/5617).
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if isinstance(module, nn.Linear):
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+
+
+@dataclass
+class PoNetForPreTrainingOutput(ModelOutput):
+    """
+    Output type of :class:`PoNetForPreTraining`.
+
+    Args:
+        loss (`optional`, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+            Sum of ``mlm_loss`` and ``sop_loss``. Only set when both ``labels`` and ``next_sentence_label`` are
+            passed to the model.
+        mlm_loss (`optional`, set together with ``loss``, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+            Masked language modeling loss.
+        sop_loss (`optional`, set together with ``loss``, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+            Cross-entropy loss of the 3-way sequence-relationship classification head.
+        prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 3)`):
+            Prediction scores of the sequence-relationship classification head (one score per class, before
+            SoftMax).
+        hidden_states
+            (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed
+            or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed
+            or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    mlm_loss: Optional[torch.FloatTensor] = None
+    sop_loss: Optional[torch.FloatTensor] = None
+    prediction_logits: torch.FloatTensor = None
+    seq_relationship_logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+PONET_START_DOCSTRING = r"""
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
+    general usage and behavior.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.ponet.PoNetConfig`):
+            Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+"""
+
+PONET_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            `What are attention masks? <../glossary.html#attention-mask>`__
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
+            Segment token indices to indicate first and second portions of the inputs. Indices are in ``[0, 1]``:
+
+            - 0 corresponds to a `sentence A` token,
+            - 1 corresponds to a `sentence B` token.
+
+            `What are token type IDs? <../glossary.html#token-type-ids>`__
+        segment_ids (`optional`):
+            Precomputed segment index handed to the encoder; when omitted, the model derives it from :obj:`input_ids`.
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
+            Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0,
+            config.max_position_embeddings - 1]``.
+
+            `What are position IDs? <../glossary.html#position-ids>`__
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attention tensors of all layers (see ``attentions`` under returned tensors).
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    'The bare PoNet Model transformer outputting raw hidden-states without any specific head on top.',
+    PONET_START_DOCSTRING,
+)
+class PoNetModel(PoNetPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as a decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
+    set to :obj:`True`. To be used in a Seq2Seq model, the model needs to be initialized with both :obj:`is_decoder`
+    argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+    input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = PoNetEmbeddings(config)
+        self.encoder = PoNetEncoder(config)
+
+        self.pooler = PoNetPooler(config) if add_pooling_layer else None
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        encoder_hidden_states
+            (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers`
+            with each tuple having 4 tensors of shape
+            :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                'You cannot specify both input_ids and inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            batch_size, seq_length = input_shape
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size, seq_length = input_shape
+        else:
+            raise ValueError(
+                'You have to specify either input_ids or inputs_embeds')
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[
+            2] if past_key_values is not None else 0
+
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                ((batch_size, seq_length + past_key_values_length)),
+                device=device)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size(
+            )
+            encoder_hidden_shape = (encoder_batch_size,
+                                    encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(
+                    encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask,
+                                       self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        # NOTE(review): `input_ids` may be None here (inputs_embeds path) - confirm the helpers below accept that.
+        segment_index = get_segment_index(
+            input_ids) if segment_ids is None else segment_ids
+        token_type_mask = get_token_type_mask(input_ids)
+        encoder_outputs = self.encoder(
+            embedding_output,
+            segment_index,
+            token_type_mask,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(
+            sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    PoNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
+    sentence prediction (classification)` head.
+    """,
+    PONET_START_DOCSTRING,
+)
+class PoNetForPreTraining(PoNetPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.ponet = PoNetModel(config)
+        self.cls = PoNetPreTrainingHeads(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=PoNetForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        next_sentence_label=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
+            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+            Labels for computing the sequence-relationship (classification) loss, returned as ``sop_loss``. The
+            classification head outputs 3 logits per example, so indices should be in ``[0, 1, 2]``.
+
+            NOTE(review): the meaning of each of the three classes is not defined in this file - confirm it
+            against the code that builds the pre-training labels.
+
+            The losses are only computed when both ``labels`` and ``next_sentence_label`` are given.
+
+        Returns:
+
+        Example::
+
+            >>> from modelscope.models.nlp.ponet import PoNetTokenizer, PoNetForPreTraining
+            >>> import torch
+
+            >>> tokenizer = PoNetTokenizer.from_pretrained('ponet-base-uncased')
+            >>> model = PoNetForPreTraining.from_pretrained('ponet-base-uncased')
+
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs)
+
+            >>> prediction_logits = outputs.prediction_logits
+            >>> seq_relationship_logits = outputs.seq_relationship_logits
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores, seq_relationship_score = self.cls(
+            sequence_output, pooled_output)
+
+        total_loss = None
+        masked_lm_loss = None
+        next_sentence_loss = None
+        if labels is not None and next_sentence_label is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+            next_sentence_loss = loss_fct(
+                seq_relationship_score.view(-1, 3),
+                next_sentence_label.view(-1))
+            total_loss = masked_lm_loss + next_sentence_loss
+
+        if not return_dict:
+            output = (prediction_scores, seq_relationship_score) + outputs[2:]
+            return ((total_loss, masked_lm_loss, next_sentence_loss)
+                    + output) if total_loss is not None else output
+
+        return PoNetForPreTrainingOutput(
+            loss=total_loss,
+            mlm_loss=masked_lm_loss,
+            sop_loss=next_sentence_loss,
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """PoNet Model with a `language modeling` head on top for CLM fine-tuning. """,
+    PONET_START_DOCSTRING)
+class PoNetLMHeadModel(PoNetPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning(
+                'If you want to use `PoNetLMHeadModel` as a standalone, add `is_decoder=True.`'
+            )
+
+        self.ponet = PoNetModel(config, add_pooling_layer=False)
+        self.cls = PoNetOnlyMLMHead(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=CausalLMOutputWithCrossAttentions,
+        config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        encoder_hidden_states (:obj:`torch.FloatTensor` of shape
+            :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
+            ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers`
+            with each tuple having 4 tensors of shape
+            :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+
+        Returns:
+
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            shifted_prediction_scores = prediction_scores[:, :
+                                                          -1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(
+                shifted_prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores, ) + outputs[2:]
+            return ((lm_loss, ) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      past=None,
+                                      attention_mask=None,
+                                      **model_kwargs):
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        # cut decoder_input_ids if past is used
+        if past is not None:
+            input_ids = input_ids[:, -1:]
+
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'past_key_values': past
+        }
+
+    def _reorder_cache(self, past, beam_idx):
+        reordered_past = ()
+        for layer_past in past:
+            reordered_past += (tuple(
+                past_state.index_select(0, beam_idx)
+                for past_state in layer_past), )
+        return reordered_past
+
+
+@add_start_docstrings(
+    """PoNet Model with a `language modeling` head on top. """,
+    PONET_START_DOCSTRING)
+class PoNetForMaskedLM(PoNetPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for '
+                'bi-directional self-attention.')
+
+        self.ponet = PoNetModel(config, add_pooling_layer=False)
+        self.cls = PoNetOnlyMLMHead(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        segment_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+            config.vocab_size - 1]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
+            ignored, the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size - 1]``.
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # default ignore_index is -100
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores, ) + outputs[2:]
+            return ((masked_lm_loss, )
+                    + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    PoNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """,
+    PONET_START_DOCSTRING,
+)
+class PoNetForSequenceClassification(PoNetPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.ponet = PoNetModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the loss. For classification, indices should be in :obj:`[0, ...,
+            config.num_labels - 1]`. If :obj:`config.problem_type` is unset: :obj:`config.num_labels == 1` gives
+            regression (MSE), integer labels give Cross-Entropy, other labels give multi-label BCE-with-logits.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # infer and cache on config
+                if self.num_labels == 1:
+                    self.config.problem_type = 'regression'
+                elif self.num_labels > 1 and (labels.dtype == torch.long
+                                              or labels.dtype == torch.int):
+                    self.config.problem_type = 'single_label_classification'
+                else:
+                    self.config.problem_type = 'multi_label_classification'
+
+            if self.config.problem_type == 'regression':
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == 'single_label_classification':
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == 'multi_label_classification':
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    PoNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    PONET_START_DOCSTRING,
+)
+class PoNetForTokenClassification(PoNetPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.ponet = PoNetModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
+            1]``.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Positions with attention_mask != 1 get ignore_index labels
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1),
+                    torch.tensor(loss_fct.ignore_index).type_as(labels))
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/modelscope/models/nlp/ponet/tokenization_ponet.py b/modelscope/models/nlp/ponet/tokenization_ponet.py
new file mode 100644
index 00000000..21544886
--- /dev/null
+++ b/modelscope/models/nlp/ponet/tokenization_ponet.py
@@ -0,0 +1,155 @@
+# Copyright 2021-2022 The Alibaba DAMO Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for PoNet """
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from transformers.file_utils import PaddingStrategy
+from transformers.models.bert.tokenization_bert import BertTokenizer
+
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE}
+
+PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'nlp_ponet_fill-mask_chinese-base': 512,
+    'nlp_ponet_fill-mask_english-base': 512,
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+    'nlp_ponet_fill-mask_chinese-base': {
+        'do_lower_case': True
+    },
+    'nlp_ponet_fill-mask_english-base': {
+        'do_lower_case': True
+    },
+}
+
+
+class PoNetTokenizer(BertTokenizer):
+    r"""
+    Construct a PoNet tokenizer. Based on BertTokenizer.
+
+    This tokenizer inherits from :class:`~transformers.BertTokenizer` which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.
+
+    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+
+    def _pad(
+        self,  # quoted names below are not imported in this module
+        encoded_inputs: Union[Dict[str, 'EncodedInput'], 'BatchEncoding'],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or
+            batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                >= 7.5 (Volta).
+            return_attention_mask: (optional) Set to False to avoid returning
+            attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = 'attention_mask' in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (
+                max_length % pad_to_multiple_of != 0):
+            max_length = (
+                (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
+            required_input) != max_length
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            if self.padding_side == 'right':
+                if return_attention_mask:
+                    encoded_inputs['attention_mask'] = [1] * len(
+                        required_input) + [0] * difference
+                if 'token_type_ids' in encoded_inputs:
+                    encoded_inputs['token_type_ids'] = (
+                        encoded_inputs['token_type_ids']
+                        + [self.pad_token_type_id] * difference)
+                if 'special_tokens_mask' in encoded_inputs:
+                    encoded_inputs['special_tokens_mask'] = encoded_inputs[
+                        'special_tokens_mask'] + [1] * difference
+                if 'segment_ids' in encoded_inputs:
+                    encoded_inputs[
+                        'segment_ids'] = encoded_inputs['segment_ids'] + [
+                            encoded_inputs['segment_ids'][-1] + 1
+                        ] * difference  # noqa *
+                encoded_inputs[self.model_input_names[
+                    0]] = required_input + [self.pad_token_id] * difference
+            elif self.padding_side == 'left':
+                if return_attention_mask:
+                    encoded_inputs['attention_mask'] = [0] * difference + [
+                        1
+                    ] * len(required_input)
+                if 'token_type_ids' in encoded_inputs:
+                    encoded_inputs['token_type_ids'] = [
+                        self.pad_token_type_id
+                    ] * difference + encoded_inputs['token_type_ids']
+                if 'segment_ids' in encoded_inputs:
+                    encoded_inputs['segment_ids'] = [encoded_inputs['segment_ids'][-1] + 1] * difference + \
+                                                    encoded_inputs['segment_ids']  # noqa *
+                if 'special_tokens_mask' in encoded_inputs:
+                    encoded_inputs['special_tokens_mask'] = [
+                        1
+                    ] * difference + encoded_inputs['special_tokens_mask']
+                encoded_inputs[self.model_input_names[
+                    0]] = [self.pad_token_id] * difference + required_input
+            else:
+                raise ValueError('Invalid padding strategy:'
+                                 + str(self.padding_side))
+        elif return_attention_mask and 'attention_mask' not in encoded_inputs:
+            encoded_inputs['attention_mask'] = [1] * len(required_input)
+
+        return encoded_inputs
diff --git a/modelscope/models/nlp/ponet_for_masked_language.py b/modelscope/models/nlp/ponet_for_masked_language.py
new file mode 100644
index 00000000..11f4bc11
--- /dev/null
+++ b/modelscope/models/nlp/ponet_for_masked_language.py
@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.ponet import \
+    PoNetForMaskedLM as PoNetForMaskedLMTransformer
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+__all__ = ['PoNetForMaskedLM']
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet)
+class PoNetForMaskedLM(TorchModel, PoNetForMaskedLMTransformer):
+    """PoNet for MLM model.
+
+    Inherited from ponet.PoNetForMaskedLM and TorchModel, so this class can be registered into Model sets.
+    """
+
+    def __init__(self, config, model_dir):
+        super(TorchModel, self).__init__(model_dir)
+        PoNetForMaskedLMTransformer.__init__(self, config)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                segment_ids=None,
+                position_ids=None,
+                head_mask=None,
+                labels=None):
+        output = PoNetForMaskedLMTransformer.forward(
+            self,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            labels=labels)
+        output[OutputKeys.INPUT_IDS] = input_ids  # pipeline needs the ids
+        return output
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model_dir = kwargs.get('model_dir')
+        return super(PoNetForMaskedLMTransformer,
+                     PoNetForMaskedLM).from_pretrained(
+                         pretrained_model_name_or_path=model_dir,
+                         model_dir=model_dir)
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 9baeefbb..42dfc972 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -11,6 +11,7 @@ if TYPE_CHECKING:
     from .document_segmentation_pipeline import DocumentSegmentationPipeline
     from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline
     from .fill_mask_pipeline import FillMaskPipeline
+    from .fill_mask_ponet_pipeline import FillMaskPonetPipeline
     from .information_extraction_pipeline import InformationExtractionPipeline
     from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline
     from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline
@@ -36,6 +37,7 @@ else:
         'document_segmentation_pipeline': ['DocumentSegmentationPipeline'],
         'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'],
         'fill_mask_pipeline': ['FillMaskPipeline'],
+        'fill_mask_ponet_pipeline': ['FillMaskPonetPipeline'],
         'named_entity_recognition_pipeline':
         ['NamedEntityRecognitionPipeline'],
         'information_extraction_pipeline': ['InformationExtractionPipeline'],
diff --git a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py
new file mode 100644
index 00000000..0bb72430
--- /dev/null
+++ b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py
@@ -0,0 +1,136 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import FillMaskPoNetPreprocessor, Preprocessor
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+
+__all__ = ['FillMaskPonetPipeline']
+_type_map = {'ponet': 'bert'}
+
+
+@PIPELINES.register_module(
+    Tasks.fill_mask, module_name=Pipelines.fill_mask_ponet)
+class FillMaskPonetPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 first_sequence='sentence',
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction
+
+        Args:
+            model (str or Model): Supply either a local model dir which supported fill-mask task,
+            or a fill-mask model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied.
+            first_sequence: The key to read the sentence in.
+
+            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
+            param will have no effect.
+
+            Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline(
+                    'fill-mask', model='damo/nlp_ponet_fill-mask_english-base')
+            >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].'
+            >>> print(pipeline_ins(input))
+
+            NOTE2: Please pay attention to the model's special tokens.
+            If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'.
+            If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is '<mask>'.
+            To view other examples please check the tests/pipelines/test_fill_mask.py.
+        """
+        fill_mask_model = model if isinstance(
+            model, Model) else Model.from_pretrained(model)
+
+        self.config = Config.from_file(
+            os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION))
+
+        if preprocessor is None:
+            preprocessor = FillMaskPoNetPreprocessor(
+                fill_mask_model.model_dir,
+                first_sequence=first_sequence,
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 512))
+
+        fill_mask_model.eval()
+        super().__init__(
+            model=fill_mask_model, preprocessor=preprocessor, **kwargs)
+
+        self.preprocessor = preprocessor
+
+        self.tokenizer = preprocessor.tokenizer
+        self.mask_id = {'roberta': 250001, 'bert': 103}  # mask token ids
+
+        self.rep_map = {
+            'bert': {
+                '[unused0]': '',
+                '[PAD]': '',
+                '[unused1]': '',
+                r' +': ' ',
+                '[SEP]': '',
+                '[unused2]': '',
+                '[CLS]': '',
+                '[UNK]': ''
+            },
+            'roberta': {
+                r' +': ' ',
+                '<mask>': '<q>',
+                '<pad>': '',
+                '<s>': '',
+                '</s>': '',
+                '<unk>': ' '
+            }
+        }
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():  # inference only, skip autograd
+            return self.model(inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """process the prediction results
+
+        Args:
+            inputs (Dict[str, Any]): model output with LOGITS and INPUT_IDS
+
+        Returns:
+            Dict[str, Any]: decoded strings (one per batch row) under TEXT
+        """
+        import numpy as np
+        logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy()
+        input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy()
+        pred_ids = np.argmax(logits, axis=-1)
+        model_type = self.model.config.model_type
+        process_type = model_type if model_type in self.mask_id else _type_map[
+            model_type]
+        rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids,
+                           input_ids)
+
+        def rep_tokens(string, rep_map):
+            for k, v in rep_map.items():
+                string = string.replace(k, v)  # literal match, not a regex
+            return string.strip()
+
+        pred_strings = []
+        for ids in rst_ids:  # batch
+            if 'language' in self.config.model and self.config.model.language == 'zh':
+                pred_string = self.tokenizer.convert_ids_to_tokens(ids)
+                pred_string = ''.join(pred_string)
+            else:
+                pred_string = self.tokenizer.decode(ids)
+            pred_string = rep_tokens(pred_string, self.rep_map[process_type])
+            pred_strings.append(pred_string)
+
+        return {OutputKeys.TEXT: pred_strings}
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 0123b32e..6012b5ba 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -22,8 +22,8 @@ if TYPE_CHECKING:
         PairSentenceClassificationPreprocessor, FillMaskPreprocessor,
         ZeroShotClassificationPreprocessor, NERPreprocessor,
         TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
-        SequenceLabelingPreprocessor, RelationExtractionPreprocessor)
-    from .slp import DocumentSegmentationPreprocessor
+        SequenceLabelingPreprocessor, RelationExtractionPreprocessor,
+        DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor)
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
                         DialogStateTrackingPreprocessor)
@@ -52,9 +52,9 @@ else:
             'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
             'TextErrorCorrectionPreprocessor',
             'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
-            'RelationExtractionPreprocessor'
+            'RelationExtractionPreprocessor',
+            'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
         ],
-        'slp': ['DocumentSegmentationPreprocessor'],
         'space': [
             'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor',
             'DialogStateTrackingPreprocessor', 'InputFeatures'
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index aaa83ed1..84e7ca4d 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -1,6 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import os.path as osp
+import re
 import uuid
 from typing import Any, Dict, Iterable, Optional, Tuple, Union
 
@@ -11,13 +12,17 @@ from transformers import AutoTokenizer, BertTokenizerFast
 from modelscope.metainfo import Models, Preprocessors
 from modelscope.models.nlp.structbert import SbertTokenizerFast
 from modelscope.outputs import OutputKeys
-from modelscope.utils.config import ConfigFields
-from modelscope.utils.constant import Fields, InputFields, ModeKeys
+from modelscope.utils.config import Config, ConfigFields
+from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
 from modelscope.utils.hub import get_model_type, parse_label_mapping
+from modelscope.utils.logger import get_logger
+from modelscope.utils.nlp.nlp_utils import import_external_nltk_data
 from modelscope.utils.type_assert import type_assert
 from .base import Preprocessor
 from .builder import PREPROCESSORS
 
+logger = get_logger()
+
 __all__ = [
     'Tokenize', 'SequenceClassificationPreprocessor',
     'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
@@ -25,7 +30,8 @@ __all__ = [
     'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
     'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
     'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor',
-    'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor'
+    'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor',
+    'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
 ]
 
 
@@ -903,3 +909,297 @@ class FaqQuestionAnsweringPreprocessor(Preprocessor):
             max_length = self.MAX_LEN
         return self.tokenizer.batch_encode_plus(
             sentence_list, padding=True, max_length=max_length)
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.document_segmentation)
+class DocumentSegmentationPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, config, *args, **kwargs):
+        """preprocess the data
+
+        Args:
+            model_dir (str): model path
+        """
+
+        super().__init__(*args, **kwargs)
+
+        self.tokenizer = BertTokenizerFast.from_pretrained(
+            model_dir,
+            use_fast=True,
+        )
+        self.question_column_name = 'labels'
+        self.context_column_name = 'sentences'
+        self.example_id_column_name = 'example_id'
+        self.label_to_id = {'B-EOP': 0, 'O': 1}
+        self.target_specical_ids = set()
+        self.target_specical_ids.add(self.tokenizer.eos_token_id)
+        self.max_seq_length = config.max_position_embeddings
+        self.label_list = ['B-EOP', 'O']
+
+    def __call__(self, examples) -> Dict[str, Any]:
+        questions = examples[self.question_column_name]
+        contexts = examples[self.context_column_name]
+        example_ids = examples[self.example_id_column_name]
+        num_examples = len(questions)
+
+        sentences = []
+        for sentence_list in contexts:
+            sentence_list = [_ + '[EOS]' for _ in sentence_list]
+            sentences.append(sentence_list)
+
+        try:
+            tokenized_examples = self.tokenizer(
+                sentences,
+                is_split_into_words=True,
+                add_special_tokens=False,
+                return_token_type_ids=True,
+                return_attention_mask=True,
+            )
+        except Exception as e:
+            logger.error(e)
+            return {}
+
+        segment_ids = []
+        token_seq_labels = []
+        for example_index in range(num_examples):
+            example_input_ids = tokenized_examples['input_ids'][example_index]
+            example_labels = questions[example_index]
+            example_labels = [
+                self.label_to_id[_] if _ in self.label_to_id else -100
+                for _ in example_labels
+            ]
+            example_token_labels = []
+            segment_id = []
+            cur_seg_id = 1
+            for token_index in range(len(example_input_ids)):
+                if example_input_ids[token_index] in self.target_specical_ids:
+                    example_token_labels.append(example_labels[cur_seg_id - 1])
+                    segment_id.append(cur_seg_id)
+                    cur_seg_id += 1
+                else:
+                    example_token_labels.append(-100)
+                    segment_id.append(cur_seg_id)
+
+            segment_ids.append(segment_id)
+            token_seq_labels.append(example_token_labels)
+
+        tokenized_examples['segment_ids'] = segment_ids
+        tokenized_examples['token_seq_labels'] = token_seq_labels
+
+        new_segment_ids = []
+        new_token_seq_labels = []
+        new_input_ids = []
+        new_token_type_ids = []
+        new_attention_mask = []
+        new_example_ids = []
+        new_sentences = []
+
+        for example_index in range(num_examples):
+            example_input_ids = tokenized_examples['input_ids'][example_index]
+            example_token_type_ids = tokenized_examples['token_type_ids'][
+                example_index]
+            example_attention_mask = tokenized_examples['attention_mask'][
+                example_index]
+            example_segment_ids = tokenized_examples['segment_ids'][
+                example_index]
+            example_token_seq_labels = tokenized_examples['token_seq_labels'][
+                example_index]
+            example_sentences = contexts[example_index]
+            example_id = example_ids[example_index]
+            example_total_num_sentences = len(questions[example_index])
+            example_total_num_tokens = len(
+                tokenized_examples['input_ids'][example_index])
+            accumulate_length = [
+                i for i, x in enumerate(tokenized_examples['input_ids']
+                                        [example_index])
+                if x == self.tokenizer.eos_token_id
+            ]
+            samples_boundary = []
+            left_index = 0
+            sent_left_index = 0
+            sent_i = 0
+
+            # for sent_i, length in enumerate(accumulate_length):
+            while sent_i < len(accumulate_length):
+                length = accumulate_length[sent_i]
+                right_index = length + 1
+                sent_right_index = sent_i + 1
+                if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens:
+                    samples_boundary.append([left_index, right_index])
+
+                    sample_input_ids = [
+                        self.tokenizer.cls_token_id
+                    ] + example_input_ids[left_index:right_index]
+                    sample_input_ids = sample_input_ids[:self.max_seq_length]
+
+                    sample_token_type_ids = [
+                        0
+                    ] + example_token_type_ids[left_index:right_index]
+                    sample_token_type_ids = sample_token_type_ids[:self.
+                                                                  max_seq_length]
+
+                    sample_attention_mask = [
+                        1
+                    ] + example_attention_mask[left_index:right_index]
+                    sample_attention_mask = sample_attention_mask[:self.
+                                                                  max_seq_length]
+
+                    sample_segment_ids = [
+                        0
+                    ] + example_segment_ids[left_index:right_index]
+                    sample_segment_ids = sample_segment_ids[:self.
+                                                            max_seq_length]
+
+                    sample_token_seq_labels = [
+                        -100
+                    ] + example_token_seq_labels[left_index:right_index]
+                    sample_token_seq_labels = sample_token_seq_labels[:self.
+                                                                      max_seq_length]
+
+                    if sent_right_index - 1 == sent_left_index:
+                        left_index = right_index
+                        sample_input_ids[-1] = self.tokenizer.eos_token_id
+                        sample_token_seq_labels[-1] = -100
+                    else:
+                        left_index = accumulate_length[sent_i - 1] + 1
+                        if sample_token_seq_labels[-1] != -100:
+                            sample_token_seq_labels[-1] = -100
+
+                    if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens:
+                        sample_sentences = example_sentences[
+                            sent_left_index:sent_right_index]
+                        sent_left_index = sent_right_index
+                        sent_i += 1
+                    else:
+                        sample_sentences = example_sentences[
+                            sent_left_index:sent_right_index - 1]
+                        sent_left_index = sent_right_index - 1
+
+                    if (len([_ for _ in sample_token_seq_labels if _ != -100
+                             ])) != len(sample_sentences) - 1 and (len([
+                                 _
+                                 for _ in sample_token_seq_labels if _ != -100
+                             ])) != len(sample_sentences):
+                        tmp = []
+                        for w_i, w, l in zip(
+                                sample_input_ids,
+                                self.tokenizer.decode(sample_input_ids).split(
+                                    ' '), sample_token_seq_labels):
+                            tmp.append((w_i, w, l))
+                    while len(sample_input_ids) < self.max_seq_length:
+                        sample_input_ids.append(self.tokenizer.pad_token_id)
+                        sample_token_type_ids.append(0)
+                        sample_attention_mask.append(0)
+                        sample_segment_ids.append(example_total_num_sentences
+                                                  + 1)
+                        sample_token_seq_labels.append(-100)
+
+                    new_input_ids.append(sample_input_ids)
+                    new_token_type_ids.append(sample_token_type_ids)
+                    new_attention_mask.append(sample_attention_mask)
+                    new_segment_ids.append(sample_segment_ids)
+                    new_token_seq_labels.append(sample_token_seq_labels)
+                    new_example_ids.append(example_id)
+                    new_sentences.append(sample_sentences)
+                else:
+                    sent_i += 1
+                    continue
+
+        output_samples = {}
+
+        output_samples['input_ids'] = new_input_ids
+        output_samples['token_type_ids'] = new_token_type_ids
+        output_samples['attention_mask'] = new_attention_mask
+
+        output_samples['segment_ids'] = new_segment_ids
+        output_samples['example_id'] = new_example_ids
+        output_samples['labels'] = new_token_seq_labels
+        output_samples['sentences'] = new_sentences
+
+        return output_samples
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.fill_mask_ponet)
+class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in MLM task.
+    """
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        kwargs['truncation'] = kwargs.get('truncation', True)
+        kwargs['padding'] = kwargs.get('padding', 'max_length')
+        kwargs['max_length'] = kwargs.pop('sequence_length', 512)
+        kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
+                                                     True)
+        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
+
+        self.cfg = Config.from_file(
+            osp.join(model_dir, ModelFile.CONFIGURATION))
+        self.language = self.cfg.model.get('language', 'en')
+        if self.language == 'en':
+            from nltk.tokenize import sent_tokenize
+            import_external_nltk_data(
+                osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt')
+        elif self.language in ['zh', 'cn']:
+
+            def sent_tokenize(para):
+                para = re.sub(r'([。！!？\?])([^”’])', r'\1\n\2', para)  # noqa *
+                para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)  # noqa *
+                para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)  # noqa *
+                para = re.sub(r'([。！？\?][”’])([^，。！？\?])', r'\1\n\2',
+                              para)  # noqa *
+                para = para.rstrip()
+                return [_ for _ in para.split('\n') if _]
+        else:
+            raise NotImplementedError
+
+        self.sent_tokenize = sent_tokenize
+        self.max_length = kwargs['max_length']
+
+    def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (tuple): [sentence1, sentence2]
+                sentence1 (str): a sentence
+                    Example:
+                        'you are so handsome.'
+                sentence2 (str): a sentence
+                    Example:
+                        'you are so beautiful.'
+        Returns:
+            Dict[str, Any]: the preprocessed data
+        """
+
+        text_a, text_b, labels = self.parse_text_and_label(data)
+        output = self.tokenizer(
+            text_a,
+            text_b,
+            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
+            **self.tokenize_kwargs)
+        max_seq_length = self.max_length
+
+        if text_b is None:
+            segment_ids = []
+            seg_lens = list(
+                map(
+                    len,
+                    self.tokenizer(
+                        self.sent_tokenize(text_a),
+                        add_special_tokens=False,
+                        truncation=True)['input_ids']))
+            segment_id = [0] + sum(
+                [[i] * sl for i, sl in enumerate(seg_lens, start=1)], [])
+            segment_id = segment_id[:max_seq_length - 1]
+            segment_ids.append(segment_id + [segment_id[-1] + 1]
+                               * (max_seq_length - len(segment_id)))
+            output['segment_ids'] = segment_ids
+
+        output = {
+            k: np.array(v) if isinstance(v, list) else v
+            for k, v in output.items()
+        }
+
+        self.labels_to_id(labels, output)
+        return output
diff --git a/modelscope/preprocessors/slp.py b/modelscope/preprocessors/slp.py
deleted file mode 100644
index d9c2d9b7..00000000
--- a/modelscope/preprocessors/slp.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from typing import Any, Dict
-
-from transformers import BertTokenizerFast
-
-from modelscope.metainfo import Preprocessors
-from modelscope.utils.constant import Fields
-from modelscope.utils.hub import get_model_type, parse_label_mapping
-from modelscope.utils.type_assert import type_assert
-from .base import Preprocessor
-from .builder import PREPROCESSORS
-
-__all__ = ['DocumentSegmentationPreprocessor']
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.document_segmentation)
-class DocumentSegmentationPreprocessor(Preprocessor):
-
-    def __init__(self, model_dir: str, config, *args, **kwargs):
-        """preprocess the data
-
-        Args:
-            model_dir (str): model path
-        """
-
-        super().__init__(*args, **kwargs)
-
-        self.tokenizer = BertTokenizerFast.from_pretrained(
-            model_dir,
-            use_fast=True,
-        )
-        self.question_column_name = 'labels'
-        self.context_column_name = 'sentences'
-        self.example_id_column_name = 'example_id'
-        self.label_to_id = {'B-EOP': 0, 'O': 1}
-        self.target_specical_ids = set()
-        self.target_specical_ids.add(self.tokenizer.eos_token_id)
-        self.max_seq_length = config.max_position_embeddings
-        self.label_list = ['B-EOP', 'O']
-
-    def __call__(self, examples) -> Dict[str, Any]:
-        questions = examples[self.question_column_name]
-        contexts = examples[self.context_column_name]
-        example_ids = examples[self.example_id_column_name]
-        num_examples = len(questions)
-
-        sentences = []
-        for sentence_list in contexts:
-            sentence_list = [_ + '[EOS]' for _ in sentence_list]
-            sentences.append(sentence_list)
-
-        try:
-            tokenized_examples = self.tokenizer(
-                sentences,
-                is_split_into_words=True,
-                add_special_tokens=False,
-                return_token_type_ids=True,
-                return_attention_mask=True,
-            )
-        except Exception as e:
-            print(str(e))
-            return {}
-
-        segment_ids = []
-        token_seq_labels = []
-        for example_index in range(num_examples):
-            example_input_ids = tokenized_examples['input_ids'][example_index]
-            example_labels = questions[example_index]
-            example_labels = [
-                self.label_to_id[_] if _ in self.label_to_id else -100
-                for _ in example_labels
-            ]
-            example_token_labels = []
-            segment_id = []
-            cur_seg_id = 1
-            for token_index in range(len(example_input_ids)):
-                if example_input_ids[token_index] in self.target_specical_ids:
-                    example_token_labels.append(example_labels[cur_seg_id - 1])
-                    segment_id.append(cur_seg_id)
-                    cur_seg_id += 1
-                else:
-                    example_token_labels.append(-100)
-                    segment_id.append(cur_seg_id)
-
-            segment_ids.append(segment_id)
-            token_seq_labels.append(example_token_labels)
-
-        tokenized_examples['segment_ids'] = segment_ids
-        tokenized_examples['token_seq_labels'] = token_seq_labels
-
-        new_segment_ids = []
-        new_token_seq_labels = []
-        new_input_ids = []
-        new_token_type_ids = []
-        new_attention_mask = []
-        new_example_ids = []
-        new_sentences = []
-
-        for example_index in range(num_examples):
-            example_input_ids = tokenized_examples['input_ids'][example_index]
-            example_token_type_ids = tokenized_examples['token_type_ids'][
-                example_index]
-            example_attention_mask = tokenized_examples['attention_mask'][
-                example_index]
-            example_segment_ids = tokenized_examples['segment_ids'][
-                example_index]
-            example_token_seq_labels = tokenized_examples['token_seq_labels'][
-                example_index]
-            example_sentences = contexts[example_index]
-            example_id = example_ids[example_index]
-            example_total_num_sentences = len(questions[example_index])
-            example_total_num_tokens = len(
-                tokenized_examples['input_ids'][example_index])
-            accumulate_length = [
-                i for i, x in enumerate(tokenized_examples['input_ids']
-                                        [example_index])
-                if x == self.tokenizer.eos_token_id
-            ]
-            samples_boundary = []
-            left_index = 0
-            sent_left_index = 0
-            sent_i = 0
-
-            # for sent_i, length in enumerate(accumulate_length):
-            while sent_i < len(accumulate_length):
-                length = accumulate_length[sent_i]
-                right_index = length + 1
-                sent_right_index = sent_i + 1
-                if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens:
-                    samples_boundary.append([left_index, right_index])
-
-                    sample_input_ids = [
-                        self.tokenizer.cls_token_id
-                    ] + example_input_ids[left_index:right_index]
-                    sample_input_ids = sample_input_ids[:self.max_seq_length]
-
-                    sample_token_type_ids = [
-                        0
-                    ] + example_token_type_ids[left_index:right_index]
-                    sample_token_type_ids = sample_token_type_ids[:self.
-                                                                  max_seq_length]
-
-                    sample_attention_mask = [
-                        1
-                    ] + example_attention_mask[left_index:right_index]
-                    sample_attention_mask = sample_attention_mask[:self.
-                                                                  max_seq_length]
-
-                    sample_segment_ids = [
-                        0
-                    ] + example_segment_ids[left_index:right_index]
-                    sample_segment_ids = sample_segment_ids[:self.
-                                                            max_seq_length]
-
-                    sample_token_seq_labels = [
-                        -100
-                    ] + example_token_seq_labels[left_index:right_index]
-                    sample_token_seq_labels = sample_token_seq_labels[:self.
-                                                                      max_seq_length]
-
-                    if sent_right_index - 1 == sent_left_index:
-                        left_index = right_index
-                        sample_input_ids[-1] = self.tokenizer.eos_token_id
-                        sample_token_seq_labels[-1] = -100
-                    else:
-                        left_index = accumulate_length[sent_i - 1] + 1
-                        if sample_token_seq_labels[-1] != -100:
-                            sample_token_seq_labels[-1] = -100
-
-                    if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens:
-                        sample_sentences = example_sentences[
-                            sent_left_index:sent_right_index]
-                        sent_left_index = sent_right_index
-                        sent_i += 1
-                    else:
-                        sample_sentences = example_sentences[
-                            sent_left_index:sent_right_index - 1]
-                        sent_left_index = sent_right_index - 1
-
-                    if (len([_ for _ in sample_token_seq_labels if _ != -100
-                             ])) != len(sample_sentences) - 1 and (len([
-                                 _
-                                 for _ in sample_token_seq_labels if _ != -100
-                             ])) != len(sample_sentences):
-                        tmp = []
-                        for w_i, w, l in zip(
-                                sample_input_ids,
-                                self.tokenizer.decode(sample_input_ids).split(
-                                    ' '), sample_token_seq_labels):
-                            tmp.append((w_i, w, l))
-                    while len(sample_input_ids) < self.max_seq_length:
-                        sample_input_ids.append(self.tokenizer.pad_token_id)
-                        sample_token_type_ids.append(0)
-                        sample_attention_mask.append(0)
-                        sample_segment_ids.append(example_total_num_sentences
-                                                  + 1)
-                        sample_token_seq_labels.append(-100)
-
-                    new_input_ids.append(sample_input_ids)
-                    new_token_type_ids.append(sample_token_type_ids)
-                    new_attention_mask.append(sample_attention_mask)
-                    new_segment_ids.append(sample_segment_ids)
-                    new_token_seq_labels.append(sample_token_seq_labels)
-                    new_example_ids.append(example_id)
-                    new_sentences.append(sample_sentences)
-                else:
-                    sent_i += 1
-                    continue
-
-        output_samples = {}
-
-        output_samples['input_ids'] = new_input_ids
-        output_samples['token_type_ids'] = new_token_type_ids
-        output_samples['attention_mask'] = new_attention_mask
-
-        output_samples['segment_ids'] = new_segment_ids
-        output_samples['example_id'] = new_example_ids
-        output_samples['labels'] = new_token_seq_labels
-        output_samples['sentences'] = new_sentences
-
-        return output_samples
diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py
index 35b374f2..64b12007 100644
--- a/modelscope/utils/nlp/nlp_utils.py
+++ b/modelscope/utils/nlp/nlp_utils.py
@@ -1,3 +1,4 @@
+import os.path as osp
 from typing import List
 
 from modelscope.outputs import OutputKeys
@@ -41,3 +42,22 @@ def tracking_and_print_dialog_states(
         print(json.dumps(result))
 
         history_states.extend([result[OutputKeys.OUTPUT], {}])
+
+
+def import_external_nltk_data(nltk_data_dir, package_name):
+    """import external nltk_data, and extract nltk zip package.
+
+    Args:
+        nltk_data_dir (str): external nltk_data dir path, eg. /home/xx/nltk_data
+        package_name (str): nltk package name, eg. tokenizers/punkt
+    """
+    import nltk
+    nltk.data.path.append(nltk_data_dir)
+
+    filepath = osp.join(nltk_data_dir, package_name + '.zip')
+    zippath = osp.join(nltk_data_dir, package_name)
+    packagepath = osp.dirname(zippath)
+    if not osp.exists(zippath):
+        import zipfile
+        with zipfile.ZipFile(filepath) as zf:
+            zf.extractall(osp.join(packagepath))
diff --git a/tests/pipelines/test_fill_mask_ponet.py b/tests/pipelines/test_fill_mask_ponet.py
new file mode 100644
index 00000000..707cc201
--- /dev/null
+++ b/tests/pipelines/test_fill_mask_ponet.py
@@ -0,0 +1,48 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class FillMaskPonetTest(unittest.TestCase):
+    model_id_ponet = {
+        'zh': 'damo/nlp_ponet_fill-mask_chinese-base',
+        'en': 'damo/nlp_ponet_fill-mask_english-base'
+    }
+
+    ori_texts = {
+        'zh':
+        '段誉轻挥折扇，摇了摇头，说道：“你师父是你的师父，你师父可不是我的师父。'
+        '你师父差得动你，你师父可差不动我。',
+        'en':
+        'Everything in what you call reality is really just a reflection of your '
+        'consciousness. Your whole universe is just a mirror reflection of your story.'
+    }
+
+    test_inputs = {
+        'zh':
+        '段誉轻[MASK]折扇，摇了摇[MASK]，[MASK]道：“你师父是你的[MASK][MASK]，你'
+        '师父可不是[MASK]的师父。你师父差得动你，你师父可[MASK]不动我。',
+        'en':
+        'Everything in [MASK] you call reality is really [MASK] a reflection of your '
+        '[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.'
+    }
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_ponet_model(self):
+        for language in ['zh', 'en']:
+            ori_text = self.ori_texts[language]
+            test_input = self.test_inputs[language]
+
+            pipeline_ins = pipeline(
+                task=Tasks.fill_mask, model=self.model_id_ponet[language])
+
+            print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+                  f'{pipeline_ins(test_input)}\n')
+
+
+if __name__ == '__main__':
+    unittest.main()

From ad6bb1e7d960c636ba3f21fd05cdffb09b916217 Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Wed, 7 Sep 2022 20:51:15 +0800
Subject: [PATCH 077/175] [to #44790143]fix: add ipykernel to image for dsw
         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10049527

    * add ipykernel to image for dsw
---
 docker/Dockerfile.ubuntu | 4 +++-
 tests/run.py             | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index 78da0b6f..e0bfa908 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -75,7 +75,9 @@ RUN pip install --no-cache-dir --upgrade pip && \
 ENV SHELL=/bin/bash
 
 # install special package
-RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 numpy==1.18.5 datasets==2.1.0
+RUN pip install --no-cache-dir 'mmcls>=0.21.0' 'mmdet>=2.25.0' 'decord>=0.6.0' datasets==2.1.0 ipykernel && \
+    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
+    pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn
 
 RUN if [ "$USE_GPU" = "True" ] ; then \
         pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
diff --git a/tests/run.py b/tests/run.py
index 51a563fe..18839622 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -420,7 +420,7 @@ if __name__ == '__main__':
     parser.add_argument(
         '--suites',
         nargs='*',
-        help='Run specified test suites(test suite file list)')
+        help='Run specified test suites(test suite files list split by space)')
     args = parser.parse_args()
     set_test_level(args.level)
     os.environ['REGRESSION_BASELINE'] = '1'

From 7b23c417484f038cd0c7fcd76f2ca95e677cac94 Mon Sep 17 00:00:00 2001
From: "tingwei.gtw" <tingwei.gtw@alibaba-inc.com>
Date: Wed, 7 Sep 2022 21:06:25 +0800
Subject: [PATCH 078/175] [to #42322933] Add video-inpainting files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

视频编辑的cr
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10026166
---
 .../test/videos/mask_dir/mask_00000_00320.png |   3 +
 .../test/videos/mask_dir/mask_00321_00633.png |   3 +
 data/test/videos/video_inpainting_test.mp4    |   3 +
 modelscope/metainfo.py                        |   2 +
 .../models/cv/video_inpainting/__init__.py    |  20 +
 .../models/cv/video_inpainting/inpainting.py  | 298 ++++++++++++++
 .../cv/video_inpainting/inpainting_model.py   | 373 ++++++++++++++++++
 modelscope/outputs.py                         |   5 +
 modelscope/pipelines/builder.py               |   2 +
 .../pipelines/cv/video_inpainting_pipeline.py |  47 +++
 modelscope/utils/constant.py                  |   3 +
 tests/pipelines/test_person_image_cartoon.py  |   1 -
 tests/pipelines/test_video_inpainting.py      |  39 ++
 13 files changed, 798 insertions(+), 1 deletion(-)
 create mode 100644 data/test/videos/mask_dir/mask_00000_00320.png
 create mode 100644 data/test/videos/mask_dir/mask_00321_00633.png
 create mode 100644 data/test/videos/video_inpainting_test.mp4
 create mode 100644 modelscope/models/cv/video_inpainting/__init__.py
 create mode 100644 modelscope/models/cv/video_inpainting/inpainting.py
 create mode 100644 modelscope/models/cv/video_inpainting/inpainting_model.py
 create mode 100644 modelscope/pipelines/cv/video_inpainting_pipeline.py
 create mode 100644 tests/pipelines/test_video_inpainting.py

diff --git a/data/test/videos/mask_dir/mask_00000_00320.png b/data/test/videos/mask_dir/mask_00000_00320.png
new file mode 100644
index 00000000..2eae71a1
--- /dev/null
+++ b/data/test/videos/mask_dir/mask_00000_00320.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b158f6029d9763d7f84042f7c5835f398c688fdbb6b3f4fe6431101d4118c66c
+size 2766
diff --git a/data/test/videos/mask_dir/mask_00321_00633.png b/data/test/videos/mask_dir/mask_00321_00633.png
new file mode 100644
index 00000000..89633eb6
--- /dev/null
+++ b/data/test/videos/mask_dir/mask_00321_00633.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dcf46b93077e2229ab69cd6ddb80e2689546c575ee538bb2033fee1124ef3e3
+size 2761
diff --git a/data/test/videos/video_inpainting_test.mp4 b/data/test/videos/video_inpainting_test.mp4
new file mode 100644
index 00000000..61f96fac
--- /dev/null
+++ b/data/test/videos/video_inpainting_test.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c9870df5a86acaaec67063183dace795479cd0f05296f13058995f475149c56
+size 2957783
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index f904b5df..1bb2c389 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -38,6 +38,7 @@ class Models(object):
     mogface = 'mogface'
     mtcnn = 'mtcnn'
     ulfd = 'ulfd'
+    video_inpainting = 'video-inpainting'
 
     # EasyCV models
     yolox = 'YOLOX'
@@ -169,6 +170,7 @@ class Pipelines(object):
     text_driven_segmentation = 'text-driven-segmentation'
     movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
     shop_segmentation = 'shop-segmentation'
+    video_inpainting = 'video-inpainting'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
diff --git a/modelscope/models/cv/video_inpainting/__init__.py b/modelscope/models/cv/video_inpainting/__init__.py
new file mode 100644
index 00000000..fd93fe3c
--- /dev/null
+++ b/modelscope/models/cv/video_inpainting/__init__.py
@@ -0,0 +1,20 @@
+# copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .inpainting_model import VideoInpainting
+
+else:
+    _import_structure = {'inpainting_model': ['VideoInpainting']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/video_inpainting/inpainting.py b/modelscope/models/cv/video_inpainting/inpainting.py
new file mode 100644
index 00000000..9632e01c
--- /dev/null
+++ b/modelscope/models/cv/video_inpainting/inpainting.py
@@ -0,0 +1,298 @@
+""" VideoInpaintingProcess
+Base modules are adapted from https://github.com/researchmm/STTN,
+originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+"""
+
+import os
+import time
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+
+torch.backends.cudnn.enabled = False
+
+w, h = 192, 96
+ref_length = 300
+neighbor_stride = 20
+default_fps = 24
+MAX_frame = 300
+
+
+def video_process(video_input_path):
+    """Decode the first frame; return ``(decode_error, fps, w, h)``.
+
+    On failure this is ``('decode_error', 0, 0, 0)``; else error is None.
+    """
+    capture = cv2.VideoCapture(video_input_path)
+    ok, first_frame = capture.read()
+    fps = 0 if ok is False else capture.get(cv2.CAP_PROP_FPS)
+    capture.release()
+    if ok is False:
+        return 'decode_error', fps, 0, 0
+    height, width = first_frame.shape[0:2]
+    return None, fps, width, height
+
+
+class Stack(object):
+    """Stack PIL images (mode taken from the first) along a new axis 2."""
+
+    def __init__(self, roll=False):
+        self.roll = roll
+
+    def __call__(self, img_group):
+        mode = img_group[0].mode
+        if mode == '1':
+            # 1-bit images are handled as 8-bit grayscale
+            img_group = [img.convert('L') for img in img_group]
+            mode = 'L'
+        if mode == 'L':
+            return np.stack([np.expand_dims(x, 2) for x in img_group], axis=2)
+        if mode != 'RGB':
+            raise NotImplementedError(f'Image mode {mode}')
+        if self.roll:
+            # reverse the channel order of every frame before stacking
+            img_group = [np.array(x)[:, :, ::-1] for x in img_group]
+        return np.stack(img_group, axis=2)
+
+
+class ToTorchFormatTensor(object):
+    """ Converts a PIL.Image to a (C x H x W) torch.FloatTensor, or a 4-D
+    numpy.ndarray via permute(2, 3, 0, 1); divides by 255 when `div` is set. """
+
+    def __init__(self, div=True):
+        self.div = div
+
+    def __call__(self, pic):
+        if isinstance(pic, np.ndarray):
+            img = torch.from_numpy(pic).permute(2, 3, 0, 1).contiguous()
+        else:
+            img = torch.ByteTensor(
+                torch.ByteStorage.from_buffer(pic.tobytes()))
+            img = img.view(pic.size[1], pic.size[0], len(pic.mode))
+            img = img.transpose(0, 1).transpose(0, 2).contiguous()
+        img = img.float().div(255) if self.div else img.float()
+        return img
+
+
+_to_tensors = transforms.Compose([Stack(), ToTorchFormatTensor()])
+
+
+def get_crop_mask_v1(mask):
+    """Return (crop_mask, res_pix, crop_box, box_in_crop) for the zero pixels
+    of `mask`; res_pix is the crop's (top, bottom, left, right) in `mask`."""
+    orig_h, orig_w, _ = mask.shape
+    if (mask == 255).all():
+        return mask, (0, int(orig_h), 0,
+                      int(orig_w)), [0, int(orig_h), 0,
+                                     int(orig_w)
+                                     ], [0, int(orig_h), 0,
+                                         int(orig_w)]
+
+    hs = np.min(np.where(mask == 0)[0])
+    he = np.max(np.where(mask == 0)[0])
+    ws = np.min(np.where(mask == 0)[1])
+    we = np.max(np.where(mask == 0)[1])
+    crop_box = [ws, hs, we, he]
+    mask_h = round(int(orig_h / 2) / 4) * 4
+    mask_w = round(int(orig_w / 2) / 4) * 4
+    if (hs < mask_h) and (he < mask_h) and (ws < mask_w) and (we < mask_w):
+        crop_mask = mask[:mask_h, :mask_w, :]
+        res_pix = (0, mask_h, 0, mask_w)
+    elif (hs < mask_h) and (he < mask_h) and (ws > mask_w) and (we > mask_w):
+        crop_mask = mask[:mask_h, orig_w - mask_w:orig_w, :]
+        res_pix = (0, mask_h, orig_w - mask_w, int(orig_w))
+    elif (hs > mask_h) and (he > mask_h) and (ws < mask_w) and (we < mask_w):
+        crop_mask = mask[orig_h - mask_h:orig_h, :mask_w, :]
+        res_pix = (orig_h - mask_h, int(orig_h), 0, mask_w)
+    elif (hs > mask_h) and (he > mask_h) and (ws > mask_w) and (we > mask_w):
+        crop_mask = mask[orig_h - mask_h:orig_h, orig_w - mask_w:orig_w, :]
+        res_pix = (orig_h - mask_h, int(orig_h), orig_w - mask_w, int(orig_w))
+    # zero area spans both halves of one axis: keep that axis whole
+    elif (hs < mask_h) and (he < mask_h) and (ws < mask_w) and (we > mask_w):
+        crop_mask = mask[:mask_h, :, :]
+        res_pix = (0, mask_h, 0, int(orig_w))
+    elif (hs < mask_h) and (he > mask_h) and (ws < mask_w) and (we < mask_w):
+        crop_mask = mask[:, :mask_w, :]
+        res_pix = (0, int(orig_h), 0, mask_w)
+    elif (hs > mask_h) and (he > mask_h) and (ws < mask_w) and (we > mask_w):
+        crop_mask = mask[orig_h - mask_h:orig_h, :, :]
+        res_pix = (orig_h - mask_h, int(orig_h), 0, int(orig_w))
+    elif (hs < mask_h) and (he > mask_h) and (ws > mask_w) and (we > mask_w):
+        crop_mask = mask[:, orig_w - mask_w:orig_w, :]
+        res_pix = (0, int(orig_h), orig_w - mask_w, int(orig_w))
+    else:
+        crop_mask = mask
+        res_pix = (0, int(orig_h), 0, int(orig_w))
+    a = ws - res_pix[2]
+    b = hs - res_pix[0]
+    c = we - res_pix[2]
+    d = he - res_pix[0]
+    return crop_mask, res_pix, crop_box, [a, b, c, d]
+
+
+def get_ref_index(neighbor_ids, length):
+    """List every `ref_length`-th index below `length` that is not one of
+    `neighbor_ids`."""
+    return [
+        idx for idx in range(0, length, ref_length) if idx not in neighbor_ids
+    ]
+
+
+def read_mask_oneImage(mpath):
+    """Load `.../mask_<start>_<end>.<ext>` and return one inverted binary
+    PIL mask (255 where the file is black) per frame, end - start + 2 in all.
+    """
+    print('mask_path: {}'.format(mpath))
+    # basename, not split('/'): also correct for OS-native path separators
+    span = os.path.basename(mpath).split('mask_')[1].split('.')[0].split('_')
+    start, end = int(span[0]), int(span[1])
+    with Image.open(mpath) as mask_file:
+        gray = np.array(mask_file.convert('L'))
+    inverted = (1 - (gray > 0).astype(np.uint8)) * 255
+    # the list keeps one spare entry beyond the start..end frame range
+    return [Image.fromarray(inverted) for _ in range(start - 1, end + 1)]
+
+
+def check_size(h, w):
+    """Return True unless the size is exactly 240 (h) x 432 (w).
+
+    The caller uses the flag to decide whether an inpainted crop must be
+    resized back to the crop's own height and width.
+    """
+    height_differs = h != 240
+    width_differs = w != 432
+    return height_differs or width_differs
+
+
+def get_mask_list(mask_path):
+    """Read every `mask_<begin>_<end>.<ext>` image found in `mask_path`.
+
+    Returns four parallel lists (cv2.imread arrays, begin frames, end frames,
+    mask file paths) ordered by begin frame, so `mask_10_*` follows `mask_2_*`
+    (a plain name sort would put it first and break sequential decoding).
+    """
+    entries = []
+    for mask_name in os.listdir(mask_path):
+        span = mask_name.split('mask_')[1].split('.')[0].split('_')
+        entries.append((int(span[0]), mask_name, int(span[1])))
+    entries.sort()
+    begin_list = [begin for begin, _, _ in entries]
+    end_list = [end for _, _, end in entries]
+    abs_mask_path = [os.path.join(mask_path, name) for _, name, _ in entries]
+    mask_list = [cv2.imread(path) for path in abs_mask_path]
+    return mask_list, begin_list, end_list, abs_mask_path
+
+
+def inpainting_by_model_balance(model, video_inputPath, mask_path,
+                                video_savePath, fps, w_ori, h_ori):
+    """Inpaint the black area of every `mask_<begin>_<end>` image in
+    `mask_path` on `end - begin + 1` sequentially decoded frames (in chunks
+    of at most `MAX_frame`) and write them to `video_savePath` as mp4v.
+    """
+    video_ori = cv2.VideoCapture(video_inputPath)
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    video_save = cv2.VideoWriter(video_savePath, fourcc, fps, (w_ori, h_ori))
+    # run on whatever device the wrapped network was placed on (GPU or CPU)
+    device = next(model.model.parameters()).device
+    mask_list, begin_list, end_list, abs_mask_path = get_mask_list(mask_path)
+    img_npy = []
+
+    for index, mask in enumerate(mask_list):
+        masks = read_mask_oneImage(abs_mask_path[index])
+        mask, res_pix, crop_for_oriimg, crop_for_inpimg = get_crop_mask_v1(
+            mask)
+        mask_h, mask_w = mask.shape[0:2]
+        # the network output is (h, w); any other crop size needs a resize
+        is_resize = (mask_h, mask_w) != (h, w)
+        begin = begin_list[index]
+        end = end_list[index]
+        print('begin: {}'.format(begin))
+        print('end: {}'.format(end))
+
+        for i in range(begin, end + 1, MAX_frame):
+            begin_time = time.time()
+            if i + MAX_frame <= end:
+                video_length = MAX_frame
+            else:
+                video_length = end - i + 1
+
+            for _ in range(video_length):
+                _, frame = video_ori.read()
+                img_npy.append(frame)
+            frames_temp = []
+            for f in img_npy:
+                f = Image.fromarray(f)
+                i_temp = f.crop(
+                    (res_pix[2], res_pix[0], res_pix[3], res_pix[1]))
+                a = i_temp.resize((w, h), Image.NEAREST)
+                frames_temp.append(a)
+            feats_temp = _to_tensors(frames_temp).unsqueeze(0) * 2 - 1
+            frames_temp = [np.array(f).astype(np.uint8) for f in frames_temp]
+            masks_temp = []
+            for m in masks[i - begin:i + video_length - begin]:
+                m_temp = m.crop(
+                    (res_pix[2], res_pix[0], res_pix[3], res_pix[1]))
+                b = m_temp.resize((w, h), Image.NEAREST)
+                masks_temp.append(b)
+            binary_masks_temp = [
+                np.expand_dims((np.array(m) != 0).astype(np.uint8), 2)
+                for m in masks_temp
+            ]
+            masks_temp = _to_tensors(masks_temp).unsqueeze(0)
+            feats_temp = feats_temp.to(device)
+            masks_temp = masks_temp.to(device)
+            comp_frames = [None] * video_length
+            model.eval()
+            with torch.no_grad():
+                feats_out = feats_temp * (1 - masks_temp).float()
+                feats_out = feats_out.view(video_length, 3, h, w)
+                feats_out = model.model.encoder(feats_out)
+                _, c, feat_h, feat_w = feats_out.size()
+                feats_out = feats_out.view(1, video_length, c, feat_h, feat_w)
+
+            for f in range(0, video_length, neighbor_stride):
+                # window of frames around f; reference frames are added below
+                neighbor_ids = list(
+                    range(
+                        max(0, f - neighbor_stride),
+                        min(video_length, f + neighbor_stride + 1)))
+                ref_ids = get_ref_index(neighbor_ids, video_length)
+                with torch.no_grad():
+                    pred_feat = model.model.infer(
+                        feats_out[0, neighbor_ids + ref_ids, :, :, :],
+                        masks_temp[0, neighbor_ids + ref_ids, :, :, :])
+                    pred_img = torch.tanh(
+                        model.model.decoder(
+                            pred_feat[:len(neighbor_ids), :, :, :])).detach()
+                    pred_img = (pred_img + 1) / 2
+                    pred_img = pred_img.cpu().permute(0, 2, 3, 1).numpy() * 255
+                    for j in range(len(neighbor_ids)):
+                        idx = neighbor_ids[j]
+                        img = np.array(pred_img[j]).astype(
+                            np.uint8) * binary_masks_temp[idx] + frames_temp[
+                                idx] * (1 - binary_masks_temp[idx])
+                        if comp_frames[idx] is None:
+                            comp_frames[idx] = img
+                        else:
+                            comp_frames[idx] = comp_frames[idx].astype(
+                                np.float32) * 0.5 + img.astype(
+                                    np.float32) * 0.5
+            print('inpainting time:', time.time() - begin_time)
+            for f in range(video_length):
+                comp = np.array(comp_frames[f]).astype(
+                    np.uint8) * binary_masks_temp[f] + frames_temp[f] * (
+                        1 - binary_masks_temp[f])
+                if is_resize:
+                    comp = cv2.resize(comp, (mask_w, mask_h))
+                complete_frame = img_npy[f]
+                a1, b1, c1, d1 = crop_for_oriimg
+                a2, b2, c2, d2 = crop_for_inpimg
+                complete_frame[b1:d1, a1:c1] = comp[b2:d2, a2:c2]
+                video_save.write(complete_frame)
+
+            img_npy = []
+    video_save.release()
+    video_ori.release()
diff --git a/modelscope/models/cv/video_inpainting/inpainting_model.py b/modelscope/models/cv/video_inpainting/inpainting_model.py
new file mode 100644
index 00000000..a791b0ab
--- /dev/null
+++ b/modelscope/models/cv/video_inpainting/inpainting_model.py
@@ -0,0 +1,373 @@
+""" VideoInpaintingNetwork
+Base modules are adapted from https://github.com/researchmm/STTN,
+originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+"""
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class BaseNetwork(nn.Module):
+    """nn.Module base adding a parameter report and weight initialisation."""
+    def __init__(self):
+        super(BaseNetwork, self).__init__()
+
+    def print_network(self):
+        if isinstance(self, list):
+            self = self[0]
+        num_params = 0
+        for param in self.parameters():
+            num_params += param.numel()
+        print(
+            'Network [%s] was created. Total number of parameters: %.1f million. '
+            'To see the architecture, do print(network).' %
+            (type(self).__name__, num_params / 1000000))
+
+    def init_weights(self, init_type='normal', gain=0.02):
+        '''
+        initialize the weights of Conv / Linear / InstanceNorm2d submodules
+        init_type: normal | xavier(_uniform) | kaiming | orthogonal | none
+        https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/9451e70673400885567d08a9e97ade2524c700d0/models/networks.py#L39
+        '''
+
+        def init_func(m):
+            classname = m.__class__.__name__
+            if classname.find('InstanceNorm2d') != -1:
+                if hasattr(m, 'weight') and m.weight is not None:
+                    nn.init.constant_(m.weight.data, 1.0)
+                if hasattr(m, 'bias') and m.bias is not None:
+                    nn.init.constant_(m.bias.data, 0.0)
+            elif hasattr(m, 'weight') and (classname.find('Conv') != -1
+                                           or classname.find('Linear') != -1):
+                if init_type == 'normal':
+                    nn.init.normal_(m.weight.data, 0.0, gain)
+                elif init_type == 'xavier':
+                    nn.init.xavier_normal_(m.weight.data, gain=gain)
+                elif init_type == 'xavier_uniform':
+                    nn.init.xavier_uniform_(m.weight.data, gain=1.0)
+                elif init_type == 'kaiming':
+                    nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
+                elif init_type == 'orthogonal':
+                    nn.init.orthogonal_(m.weight.data, gain=gain)
+                elif init_type == 'none':
+                    m.reset_parameters()
+                else:
+                    raise NotImplementedError(
+                        'initialization method [%s] is not implemented'
+                        % init_type)
+                if hasattr(m, 'bias') and m.bias is not None:
+                    nn.init.constant_(m.bias.data, 0.0)
+
+        self.apply(init_func)
+
+        for m in self.children():
+            if hasattr(m, 'init_weights'):
+                m.init_weights(init_type, gain)
+
+
+@MODELS.register_module(
+    Tasks.video_inpainting, module_name=Models.video_inpainting)
+class VideoInpainting(TorchModel):
+    """Inference wrapper: weights load on CPU, then move to GPU if usable."""
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+        self.model = InpaintGenerator()
+        weights = '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
+        pretrained_params = torch.load(weights, map_location='cpu')
+        self.model.load_state_dict(pretrained_params['netG'])
+        self.model.eval()
+        self.device_id = device_id
+        if self.device_id >= 0 and torch.cuda.is_available():
+            self.model.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        else:
+            self.device_id = -1
+            logger.info('Use CPU for inference')
+
+
+class InpaintGenerator(BaseNetwork):
+    """Conv encoder, stack of 6 TransformerBlocks and upsampling decoder."""
+    def __init__(self, init_weights=True):
+        super(InpaintGenerator, self).__init__()
+        channel = 256
+        stack_num = 6
+        patchsize = [(48, 24), (16, 8), (8, 4), (4, 2)]
+        blocks = []
+        for _ in range(stack_num):
+            blocks.append(TransformerBlock(patchsize, hidden=channel))
+        self.transformer = nn.Sequential(*blocks)
+
+        self.encoder = nn.Sequential(
+            nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(128, channel, kernel_size=3, stride=1, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+        )
+
+        self.decoder = nn.Sequential(
+            deconv(channel, 128, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            deconv(64, 64, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(64, 3, kernel_size=3, stride=1, padding=1))
+
+        if init_weights:
+            self.init_weights()
+
+    def forward(self, masked_frames, masks):
+        b, t, c, h, w = masked_frames.size()
+        masks = masks.view(b * t, 1, h, w)
+        enc_feat = self.encoder(masked_frames.view(b * t, c, h, w))
+        _, c, h, w = enc_feat.size()
+        masks = F.interpolate(masks, scale_factor=1.0 / 4)
+        enc_feat = self.transformer({
+            'x': enc_feat,
+            'm': masks,
+            'b': b,
+            'c': c
+        })['x']
+        output = self.decoder(enc_feat)
+        output = torch.tanh(output)
+        return output
+
+    def infer(self, feat, masks):
+        t, c, h, w = masks.size()
+        masks = masks.view(t, c, h, w)
+        masks = F.interpolate(masks, scale_factor=1.0 / 4)
+        t, c, _, _ = feat.size()
+        enc_feat = self.transformer({
+            'x': feat,
+            'm': masks,
+            'b': 1,
+            'c': c
+        })['x']
+        return enc_feat
+
+
+class deconv(nn.Module):
+    """Bilinear 2x upsampling followed by a stride-1 Conv2d."""
+    def __init__(self,
+                 input_channel,
+                 output_channel,
+                 kernel_size=3,
+                 padding=0):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            input_channel,
+            output_channel,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=padding)
+
+    def forward(self, x):
+        x = F.interpolate(
+            x, scale_factor=2, mode='bilinear', align_corners=True)
+        x = self.conv(x)
+        return x
+
+
+class Attention(nn.Module):
+    """Scaled dot-product attention.
+    NOTE(review): `masked_fill` below is not in-place and its result is
+    discarded, so `m` never alters the scores -- confirm this is intended."""
+
+    def forward(self, query, key, value, m):
+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(
+            query.size(-1))
+        scores.masked_fill(m, -1e9)
+        p_attn = F.softmax(scores, dim=-1)
+        p_val = torch.matmul(p_attn, value)
+        return p_val, p_attn
+
+
+class MultiHeadedAttention(nn.Module):
+    """
+    Patch attention: channels are split evenly, one chunk per patch size.
+    """
+
+    def __init__(self, patchsize, d_model):
+        super().__init__()
+        self.patchsize = patchsize
+        self.query_embedding = nn.Conv2d(
+            d_model, d_model, kernel_size=1, padding=0)
+        self.value_embedding = nn.Conv2d(
+            d_model, d_model, kernel_size=1, padding=0)
+        self.key_embedding = nn.Conv2d(
+            d_model, d_model, kernel_size=1, padding=0)
+        self.output_linear = nn.Sequential(
+            nn.Conv2d(d_model, d_model, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2, inplace=True))
+        self.attention = Attention()
+
+    def forward(self, x, m, b, c):
+        bt, _, h, w = x.size()
+        t = bt // b
+        d_k = c // len(self.patchsize)
+        output = []
+        _query = self.query_embedding(x)
+        _key = self.key_embedding(x)
+        _value = self.value_embedding(x)
+        for (width, height), query, key, value in zip(
+                self.patchsize,
+                torch.chunk(_query, len(self.patchsize), dim=1),
+                torch.chunk(_key, len(self.patchsize), dim=1),
+                torch.chunk(_value, len(self.patchsize), dim=1)):
+            out_w, out_h = w // width, h // height
+            mm = m.view(b, t, 1, out_h, height, out_w, width)
+            mm = mm.permute(0, 1, 3, 5, 2, 4,
+                            6).contiguous().view(b, t * out_h * out_w,
+                                                 height * width)
+            mm = (mm.mean(-1) > 0.5).unsqueeze(1).repeat(
+                1, t * out_h * out_w, 1)
+            query = query.view(b, t, d_k, out_h, height, out_w, width)
+            query = query.permute(0, 1, 3, 5, 2, 4,
+                                  6).contiguous().view(b, t * out_h * out_w,
+                                                       d_k * height * width)
+            key = key.view(b, t, d_k, out_h, height, out_w, width)
+            key = key.permute(0, 1, 3, 5, 2, 4,
+                              6).contiguous().view(b, t * out_h * out_w,
+                                                   d_k * height * width)
+            value = value.view(b, t, d_k, out_h, height, out_w, width)
+            value = value.permute(0, 1, 3, 5, 2, 4,
+                                  6).contiguous().view(b, t * out_h * out_w,
+                                                       d_k * height * width)
+            y, _ = self.attention(query, key, value, mm)
+            y = y.view(b, t, out_h, out_w, d_k, height, width)
+            y = y.permute(0, 1, 4, 2, 5, 3, 6).contiguous().view(bt, d_k, h, w)
+            output.append(y)
+        output = torch.cat(output, 1)
+        x = self.output_linear(output)
+        return x
+
+
+class FeedForward(nn.Module):
+    """Dilated 3x3 conv then plain 3x3 conv, each followed by LeakyReLU."""
+    def __init__(self, d_model):
+        super(FeedForward, self).__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(d_model, d_model, kernel_size=3, padding=2, dilation=2),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(d_model, d_model, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2, inplace=True))
+
+    def forward(self, x):
+        x = self.conv(x)
+        return x
+
+
+class TransformerBlock(nn.Module):
+    """Residual MultiHead_Attention + Feed_Forward block; forward() takes and
+    returns a dict with keys 'x', 'm', 'b', 'c' (only 'x' is changed).
+    """
+
+    def __init__(self, patchsize, hidden=128):
+        super().__init__()
+        self.attention = MultiHeadedAttention(patchsize, d_model=hidden)
+        self.feed_forward = FeedForward(hidden)
+
+    def forward(self, x):
+        x, m, b, c = x['x'], x['m'], x['b'], x['c']
+        x = x + self.attention(x, m, b, c)
+        x = x + self.feed_forward(x)
+        return {'x': x, 'm': m, 'b': b, 'c': c}
+
+
+class Discriminator(BaseNetwork):
+    """Six Conv3d layers; the first five are wrapped in spectral_norm()."""
+    def __init__(self,
+                 in_channels=3,
+                 use_sigmoid=False,
+                 use_spectral_norm=True,
+                 init_weights=True):
+        super(Discriminator, self).__init__()
+        self.use_sigmoid = use_sigmoid
+        nf = 64
+
+        self.conv = nn.Sequential(
+            spectral_norm(
+                nn.Conv3d(
+                    in_channels=in_channels,
+                    out_channels=nf * 1,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=1,
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            spectral_norm(
+                nn.Conv3d(
+                    nf * 1,
+                    nf * 2,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=(1, 2, 2),
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            spectral_norm(
+                nn.Conv3d(
+                    nf * 2,
+                    nf * 4,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=(1, 2, 2),
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            spectral_norm(
+                nn.Conv3d(
+                    nf * 4,
+                    nf * 4,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=(1, 2, 2),
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            spectral_norm(
+                nn.Conv3d(
+                    nf * 4,
+                    nf * 4,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=(1, 2, 2),
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv3d(
+                nf * 4,
+                nf * 4,
+                kernel_size=(3, 5, 5),
+                stride=(1, 2, 2),
+                padding=(1, 2, 2)))
+
+        if init_weights:
+            self.init_weights()
+
+    def forward(self, xs):
+        xs_t = torch.transpose(xs, 0, 1)
+        xs_t = xs_t.unsqueeze(0)
+        feat = self.conv(xs_t)
+        if self.use_sigmoid:
+            feat = torch.sigmoid(feat)
+        out = torch.transpose(feat, 1, 2)
+        return out
+
+
+def spectral_norm(module, mode=True):
+    """Wrap `module` in torch spectral normalization when `mode` is truthy."""
+    from torch.nn.utils import spectral_norm as _spectral_norm
+    return _spectral_norm(module) if mode else module
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 6c7500bb..37ab3481 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -610,4 +610,9 @@ TASK_OUTPUTS = {
     #       "img_embedding": np.array with shape [1, D],
     #   }
     Tasks.image_reid_person: [OutputKeys.IMG_EMBEDDING],
+
+    # {
+    #     'output': 'Done' or 'decode_error'
+    # }
+    Tasks.video_inpainting: [OutputKeys.OUTPUT]
 }
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index fa79ca11..a1f093a3 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -168,6 +168,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/cv_resnet50-bert_video-scene-segmentation_movienet'),
     Tasks.shop_segmentation: (Pipelines.shop_segmentation,
                               'damo/cv_vitb16_segmentation_shop-seg'),
+    Tasks.video_inpainting: (Pipelines.video_inpainting,
+                             'damo/cv_video-inpainting'),
 }
 
 
diff --git a/modelscope/pipelines/cv/video_inpainting_pipeline.py b/modelscope/pipelines/cv/video_inpainting_pipeline.py
new file mode 100644
index 00000000..15444e05
--- /dev/null
+++ b/modelscope/pipelines/cv/video_inpainting_pipeline.py
@@ -0,0 +1,47 @@
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.video_inpainting import inpainting
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.video_inpainting, module_name=Pipelines.video_inpainting)
+class VideoInpaintingPipeline(Pipeline):
+    """Pipeline that inpaints a video file using mask images on disk."""
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create video inpainting pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+
+        super().__init__(model=model, **kwargs)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Inpaint the video described by `input` (keys `video_input_path`,
+        `mask_path`, `video_output_path`); output: 'Done' or 'decode_error'.
+        """
+        decode_error, fps, w, h = inpainting.video_process(
+            input['video_input_path'])
+        if decode_error is not None:
+            return {OutputKeys.OUTPUT: 'decode_error'}
+        inpainting.inpainting_by_model_balance(self.model,
+                                               input['video_input_path'],
+                                               input['mask_path'],
+                                               input['video_output_path'], fps,
+                                               w, h)
+        return {OutputKeys.OUTPUT: 'Done'}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 47d38dd7..8fb00ed6 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -70,6 +70,9 @@ class CVTasks(object):
     crowd_counting = 'crowd-counting'
     movie_scene_segmentation = 'movie-scene-segmentation'
 
+    # video editing
+    video_inpainting = 'video-inpainting'
+
     # reid and tracking
     video_single_object_tracking = 'video-single-object-tracking'
     video_summarization = 'video-summarization'
diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py
index bdbf8b61..90aaa500 100644
--- a/tests/pipelines/test_person_image_cartoon.py
+++ b/tests/pipelines/test_person_image_cartoon.py
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 import os.path as osp
 import unittest
 
diff --git a/tests/pipelines/test_video_inpainting.py b/tests/pipelines/test_video_inpainting.py
new file mode 100644
index 00000000..8364b1b3
--- /dev/null
+++ b/tests/pipelines/test_video_inpainting.py
@@ -0,0 +1,39 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class VideoInpaintingTest(unittest.TestCase):
+    """Smoke test: runs the video-inpainting pipeline and prints the result."""
+    def setUp(self) -> None:
+        self.model = 'damo/cv_video-inpainting'
+        self.mask_dir = 'data/test/videos/mask_dir'
+        self.video_in = 'data/test/videos/video_inpainting_test.mp4'
+        self.video_out = 'out.mp4'
+        self.input = {
+            'video_input_path': self.video_in,
+            'video_output_path': self.video_out,
+            'mask_path': self.mask_dir
+        }
+
+    def pipeline_inference(self, pipeline: Pipeline, input: dict):
+        result = pipeline(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        video_inpainting = pipeline(Tasks.video_inpainting, model=self.model)
+        self.pipeline_inference(video_inpainting, self.input)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        video_inpainting = pipeline(Tasks.video_inpainting)
+        self.pipeline_inference(video_inpainting, self.input)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 6fd5f671fa698e57d899498e6bdf5a5fec7d8d67 Mon Sep 17 00:00:00 2001
From: "shichen.fsc" <shichen.fsc@alibaba-inc.com>
Date: Thu, 8 Sep 2022 11:00:35 +0800
Subject: [PATCH 079/175] [to #42322933] add httpurl support for ASR        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10043608

---
 .../pipelines/audio/asr_inference_pipeline.py | 15 ++++--
 modelscope/utils/audio/audio_utils.py         | 44 ++++++++++++++++++
 .../test_automatic_speech_recognition.py      | 46 +++++++++++--------
 3 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py
index b321b770..282d1184 100644
--- a/modelscope/pipelines/audio/asr_inference_pipeline.py
+++ b/modelscope/pipelines/audio/asr_inference_pipeline.py
@@ -1,4 +1,3 @@
-import os
 from typing import Any, Dict, List, Sequence, Tuple, Union
 
 import yaml
@@ -9,6 +8,8 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import WavToScp
+from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav,
+                                                load_bytes_from_url)
 from modelscope.utils.constant import Frameworks, Tasks
 from modelscope.utils.logger import get_logger
 
@@ -41,12 +42,20 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
 
         self.recog_type = recog_type
         self.audio_format = audio_format
-        self.audio_in = audio_in
         self.audio_fs = audio_fs
 
+        if isinstance(audio_in, str):
+            # load pcm data from url if audio_in is url str
+            self.audio_in = load_bytes_from_url(audio_in)
+        elif isinstance(audio_in, bytes):
+            # load pcm data from wav data if audio_in is wave format
+            self.audio_in = extract_pcm_from_wav(audio_in)
+        else:
+            self.audio_in = audio_in
+
         if recog_type is None or audio_format is None:
             self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking(
-                audio_in=audio_in,
+                audio_in=self.audio_in,
                 recog_type=recog_type,
                 audio_format=audio_format)
 
diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py
index 61964345..c93e0102 100644
--- a/modelscope/utils/audio/audio_utils.py
+++ b/modelscope/utils/audio/audio_utils.py
@@ -1,4 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import struct
+from typing import Union
+from urllib.parse import urlparse
+
+from modelscope.fileio.file import HTTPStorage
+
 SEGMENT_LENGTH_TRAIN = 16000
 
 
@@ -29,3 +35,41 @@ def audio_norm(x):
     scalarx = 10**(-25 / 20) / rmsx
     x = x * scalarx
     return x
+
+
+def extract_pcm_from_wav(wav: bytes) -> bytes:
+    """Strip a canonical RIFF/WAVE header and return the raw PCM payload.
+
+    Only the two common layouts are recognised: a 16-byte ``fmt `` chunk
+    (44-byte header) and an 18-byte one (46-byte header).  Input that is
+    too short, is not RIFF/WAVE, or uses another ``fmt `` size is returned
+    unchanged, so raw PCM bytes pass through untouched.
+
+    The magic fields are compared as bytes: decoding them as UTF-8 would
+    raise UnicodeDecodeError on arbitrary PCM samples.
+    """
+    # header length in bytes, keyed by the size of the ``fmt `` sub-chunk
+    header_len_by_fmt_size = {16: 44, 18: 46}
+
+    if len(wav) <= 44:
+        return wav
+    if wav[0:4] != b'RIFF' or wav[8:12] != b'WAVE' or wav[12:16] != b'fmt ':
+        return wav
+
+    fmt_size = struct.unpack('<I', wav[16:20])[0]
+    header_len = header_len_by_fmt_size.get(fmt_size)
+    if header_len is None:
+        return wav
+    return wav[header_len:]
+
+
+def load_bytes_from_url(url: str) -> Union[bytes, str]:
+    """Download http(s) audio and return its pcm bytes; else return url.
+
+    Only http/https count as remote, so strings that merely parse with a
+    scheme (e.g. a Windows drive path) are passed through as local paths.
+    """
+    if urlparse(url).scheme not in ('http', 'https'):
+        return url
+    # the wav header of the downloaded file is stripped when one is found
+    return extract_pcm_from_wav(HTTPStorage().read(url))
diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py
index a83f5031..7f4ce88e 100644
--- a/tests/pipelines/test_automatic_speech_recognition.py
+++ b/tests/pipelines/test_automatic_speech_recognition.py
@@ -16,16 +16,11 @@ from modelscope.utils.test_utils import download_and_untar, test_level
 logger = get_logger()
 
 WAV_FILE = 'data/test/audios/asr_example.wav'
+URL_FILE = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example.wav'
 
 LITTLE_TESTSETS_FILE = 'data_aishell.tar.gz'
 LITTLE_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/data_aishell.tar.gz'
 
-AISHELL1_TESTSETS_FILE = 'aishell1.tar.gz'
-AISHELL1_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/aishell1.tar.gz'
-
-TFRECORD_TESTSETS_FILE = 'tfrecord.tar.gz'
-TFRECORD_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/tfrecord.tar.gz'
-
 
 class AutomaticSpeechRecognitionTest(unittest.TestCase):
     action_info = {
@@ -45,6 +40,10 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
             'checking_item': OutputKeys.TEXT,
             'example': 'wav_example'
         },
+        'test_run_with_url_tf': {
+            'checking_item': OutputKeys.TEXT,
+            'example': 'wav_example'
+        },
         'test_run_with_wav_dataset_pytorch': {
             'checking_item': OutputKeys.TEXT,
             'example': 'dataset_example'
@@ -132,8 +131,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_wav_pytorch(self):
-        '''run with single waveform file
-        '''
+        """run with single waveform file
+        """
 
         logger.info('Run ASR test with waveform file (pytorch)...')
 
@@ -145,8 +144,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_pcm_pytorch(self):
-        '''run with wav data
-        '''
+        """run with wav data
+        """
 
         logger.info('Run ASR test with wav data (pytorch)...')
 
@@ -158,8 +157,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_wav_tf(self):
-        '''run with single waveform file
-        '''
+        """run with single waveform file
+        """
 
         logger.info('Run ASR test with waveform file (tensorflow)...')
 
@@ -171,8 +170,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_pcm_tf(self):
-        '''run with wav data
-        '''
+        """run with wav data
+        """
 
         logger.info('Run ASR test with wav data (tensorflow)...')
 
@@ -182,9 +181,20 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
             model_id=self.am_tf_model_id, audio_in=audio, sr=sr)
         self.check_result('test_run_with_pcm_tf', rec_result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_url_tf(self):
+        """run with a single wav file referenced by an http url (URL_FILE)
+        """
+
+        logger.info('Run ASR test with url file (tensorflow)...')
+
+        rec_result = self.run_pipeline(
+            model_id=self.am_tf_model_id, audio_in=URL_FILE)
+        self.check_result('test_run_with_url_tf', rec_result)
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_wav_dataset_pytorch(self):
-        '''run with datasets, and audio format is waveform
+        """run with datasets, and audio format is waveform
            datasets directory:
              <dataset_path>
                wav
@@ -199,7 +209,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
                    ...
                transcript
                  data.text  # hypothesis text
-        '''
+        """
 
         logger.info('Run ASR test with waveform dataset (pytorch)...')
         logger.info('Downloading waveform testsets file ...')
@@ -215,7 +225,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_wav_dataset_tf(self):
-        '''run with datasets, and audio format is waveform
+        """run with datasets, and audio format is waveform
            datasets directory:
              <dataset_path>
                wav
@@ -230,7 +240,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
                    ...
                transcript
                  data.text  # hypothesis text
-        '''
+        """
 
         logger.info('Run ASR test with waveform dataset (tensorflow)...')
         logger.info('Downloading waveform testsets file ...')

From d4759e4c242971e7c7a5610f10307f8abfd158ee Mon Sep 17 00:00:00 2001
From: cyc385202 <cyc385202@alibaba-inc.com>
Date: Thu, 8 Sep 2022 13:45:14 +0800
Subject: [PATCH 080/175] =?UTF-8?q?[to=20#42322933]=20=E5=8A=A0=E5=85=A5sp?=
 =?UTF-8?q?ace=E6=A8=A1=E5=9E=8B=E5=9C=A8banking=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?=E9=9B=86=E4=B8=8A=E7=9A=84finetune=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

加入space模型在banking数据集上的微调代码
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10006792
---
 modelscope/metainfo.py                        |   1 +
 modelscope/preprocessors/space/__init__.py    |   2 +
 modelscope/preprocessors/space/args.py        |  66 +++++++++
 modelscope/preprocessors/space/batch.py       |  55 +++++++
 modelscope/preprocessors/space/data_loader.py | 112 +++++++++++++++
 .../space/fields/intent_field.py              |   1 -
 .../preprocessors/space/lazy_dataset.py       |  47 ++++++
 modelscope/preprocessors/space/preprocess.py  |  48 +++++++
 modelscope/preprocessors/space/sampler.py     |  75 ++++++++++
 .../nlp/space/dialog_intent_trainer.py        | 134 ++++++++++++++++++
 tests/trainers/test_dialog_intent_trainer.py  | 101 +++++++++++++
 11 files changed, 641 insertions(+), 1 deletion(-)
 create mode 100644 modelscope/preprocessors/space/args.py
 create mode 100644 modelscope/preprocessors/space/batch.py
 create mode 100644 modelscope/preprocessors/space/data_loader.py
 create mode 100644 modelscope/preprocessors/space/lazy_dataset.py
 create mode 100644 modelscope/preprocessors/space/preprocess.py
 create mode 100644 modelscope/preprocessors/space/sampler.py
 create mode 100644 modelscope/trainers/nlp/space/dialog_intent_trainer.py
 create mode 100644 tests/trainers/test_dialog_intent_trainer.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 1bb2c389..e051bb76 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -241,6 +241,7 @@ class Trainers(object):
 
     # nlp trainers
     bert_sentiment_analysis = 'bert-sentiment-analysis'
+    dialog_intent_trainer = 'dialog-intent-trainer'
     nlp_base_trainer = 'nlp-base-trainer'
     nlp_veco_trainer = 'nlp-veco-trainer'
 
diff --git a/modelscope/preprocessors/space/__init__.py b/modelscope/preprocessors/space/__init__.py
index f216287b..b484dabe 100644
--- a/modelscope/preprocessors/space/__init__.py
+++ b/modelscope/preprocessors/space/__init__.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
+    from .data_loader import DataLoader
     from .dialog_intent_prediction_preprocessor import \
         DialogIntentPredictionPreprocessor
     from .dialog_modeling_preprocessor import DialogModelingPreprocessor
@@ -13,6 +14,7 @@ if TYPE_CHECKING:
 
 else:
     _import_structure = {
+        'data_loader': ['DataLoader'],
         'dialog_intent_prediction_preprocessor':
         ['DialogIntentPredictionPreprocessor'],
         'dialog_modeling_preprocessor': ['DialogModelingPreprocessor'],
diff --git a/modelscope/preprocessors/space/args.py b/modelscope/preprocessors/space/args.py
new file mode 100644
index 00000000..d9e91e74
--- /dev/null
+++ b/modelscope/preprocessors/space/args.py
@@ -0,0 +1,66 @@
+"""
+Parse argument.
+"""
+
+import argparse
+
+import json
+
+
+def str2bool(v):
+    """Parse a yes/no style string into a bool; argparse ``type=`` hook."""
+    table = dict.fromkeys(('yes', 'true', 't', 'y', '1'), True)
+    table.update(dict.fromkeys(('no', 'false', 'f', 'n', '0'), False))
+    if v.lower() not in table:
+        raise argparse.ArgumentTypeError('Unsupported value encountered.')
+    return table[v.lower()]
+
+
+class HParams(dict):
+    """ Hyper-parameters class
+
+    A dict whose keys (and those of nested HParams) read as attributes.
+    """
+
+    def __getattr__(self, name):
+        if name in self:
+            return self[name]
+        for v in self.values():
+            if isinstance(v, HParams) and name in v:
+                return v[name]
+        raise AttributeError(f"'HParams' object has no attribute '{name}'")
+
+    def __setattr__(self, name, value):
+        self[name] = value
+
+    def save(self, filename):
+        with open(filename, 'w', encoding='utf-8') as fp:
+            json.dump(self, fp, ensure_ascii=False, indent=4, sort_keys=False)
+
+    def load(self, filename):
+        """Merge a json file into self; new dict sections become HParams."""
+        with open(filename, 'r', encoding='utf-8') as fp:
+            params_dict = json.load(fp)
+        for k, v in params_dict.items():
+            if isinstance(v, dict) and isinstance(self.get(k), dict):
+                self[k].update(HParams(v))
+            else:
+                self[k] = HParams(v) if isinstance(v, dict) else v
+
+
+def parse_args(parser):
+    """ Parse hyper-parameters from cmdline. """
+    namespace = parser.parse_args()
+    args = HParams()
+    # second parser group, minus its first action, goes to the top level
+    # (presumably argparse's optional group and its built-in help action)
+    for action in parser._action_groups[1]._group_actions[1:]:
+        args[action.dest] = getattr(namespace, action.dest)
+    # each later group becomes a nested HParams keyed by the group title
+    for group in parser._action_groups[2:]:
+        group_args = HParams(
+            (action.dest, getattr(namespace, action.dest))
+            for action in group._group_actions)
+        if len(group_args) > 0:
+            args[group.title] = group_args
+    return args
diff --git a/modelscope/preprocessors/space/batch.py b/modelscope/preprocessors/space/batch.py
new file mode 100644
index 00000000..fe0ad0ec
--- /dev/null
+++ b/modelscope/preprocessors/space/batch.py
@@ -0,0 +1,55 @@
+def batch(reader, batch_size, drop_last=False):
+    """
+    This operator creates a batched reader which combines the data from the
+    input reader to batched data.
+
+    Args:
+        reader(callable): zero-argument callable returning an iterable.
+        batch_size(int): size of each mini-batch.
+        drop_last(bool, optional): If true, a trailing batch that holds
+            fewer than batch_size items is dropped; otherwise it is yielded
+            as is. Default: False.
+    Returns:
+        A zero-argument callable; each call yields lists of instances.
+
+    Raises:
+        ValueError: if batch_size is not a positive integer.
+
+    Examples:
+        .. code-block:: python
+
+            from modelscope.preprocessors.space.batch import batch
+            def reader():
+                for i in range(10):
+                    yield i
+            batch_reader = batch(reader, batch_size=2)
+
+            for data in batch_reader():
+                print(data)
+
+            # Output is
+            # [0, 1]
+            # [2, 3]
+            # [4, 5]
+            # [6, 7]
+            # [8, 9]
+    """
+
+    def batch_reader():
+        r = reader()
+        b = []
+        for instance in r:
+            b.append(instance)
+            if len(b) == batch_size:
+                yield b
+                b = []
+        if not drop_last and b:
+            yield b
+
+    # Batch size check (the closure above sees the converted int)
+    batch_size = int(batch_size)
+    if batch_size <= 0:
+        raise ValueError('batch_size should be a positive integral value, '
+                         'but got batch_size={}'.format(batch_size))
+
+    return batch_reader
diff --git a/modelscope/preprocessors/space/data_loader.py b/modelscope/preprocessors/space/data_loader.py
new file mode 100644
index 00000000..bd04a79c
--- /dev/null
+++ b/modelscope/preprocessors/space/data_loader.py
@@ -0,0 +1,112 @@
+"""
+DataLoader class
+"""
+
+import math
+import os
+
+import numpy as np
+
+from modelscope.preprocessors.space.args import str2bool
+from modelscope.preprocessors.space.batch import batch
+from modelscope.preprocessors.space.lazy_dataset import LazyDataset
+from modelscope.preprocessors.space.sampler import (RandomSampler,
+                                                    SequentialSampler,
+                                                    SortedSampler)
+
+
+def get_data_loader(batch_size, reader, hparams, file, collate_fn, is_test):
+    """Build a DataLoader over the LazyDataset stored in ``file``."""
+    assert os.path.exists(file), f"{file} doesn't exist"
+    # only the Trainer section of hparams is handed to the loader
+    return DataLoader(
+        LazyDataset(file, reader=reader),
+        batch_size,
+        hparams.Trainer,
+        collate_fn=collate_fn,
+        is_test=is_test)
+
+
+def get_sequential_data_loader(batch_size, reader, hparams, data_paths,
+                               collate_fn, data_type):
+    """Wrap one loader per data path into a SequentialDataLoaderWrapper."""
+    # each path is expected to hold '<data_type>.<tokenizer_type>.jsonl';
+    # every split except 'train' is loaded in test mode
+    per_path_loaders = [
+        get_data_loader(
+            batch_size=batch_size,
+            reader=reader,
+            hparams=hparams,
+            file=os.path.join(
+                data_path,
+                f'{data_type}.{hparams.BPETextField.tokenizer_type}.jsonl'),
+            collate_fn=collate_fn,
+            is_test=(data_type != 'train')) for data_path in data_paths
+    ]
+    return SequentialDataLoaderWrapper(per_path_loaders)
+
+
+class DataLoader(object):
+    """ Batched loader: yields collated samples of an indexable dataset. """
+
+    @classmethod
+    def add_cmdline_argument(cls, group):
+        group.add_argument('--shuffle', type=str2bool, default=True)
+        group.add_argument('--sort_pool_size', type=int, default=0)
+        return group
+
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 hparams,
+                 collate_fn=None,
+                 sampler=None,
+                 is_test=False):
+        self.dataset = dataset
+        self.collate_fn = collate_fn
+        self.gpu = hparams.gpu
+        self.sort_pool_size = hparams.sort_pool_size
+
+        if sampler is None:
+            if hparams.shuffle and not is_test:
+                sampler = RandomSampler(dataset)
+            else:
+                sampler = SequentialSampler(dataset)
+
+        if self.sort_pool_size > 0 and not is_test:
+            sampler = SortedSampler(sampler, self.sort_pool_size)
+
+        # incomplete batches are only dropped for multi-gpu training
+        drop_last = self.gpu > 1 and not is_test
+        self.reader = batch(
+            lambda: iter(sampler), batch_size=batch_size, drop_last=drop_last)
+        self.num_batches = math.floor(len(dataset) / batch_size) if drop_last \
+            else math.ceil(len(dataset) / batch_size)
+
+    def __len__(self):
+        return self.num_batches
+
+    def __iter__(self):
+        for batch_indices in self.reader():
+            samples = [self.dataset[idx] for idx in batch_indices]
+            # collate_fn is optional: without it the raw sample list is yielded
+            yield (samples if self.collate_fn is None else
+                   self.collate_fn(samples))
+
+
+class SequentialDataLoaderWrapper:
+    """ Iterate several data loaders back to back as (data_file, batch). """
+
+    def __init__(self, data_loaders):
+        self.data_loaders = data_loaders
+        datasets = [loader.dataset for loader in self.data_loaders]
+        self.data_file_to_dataset = {ds.data_file: ds for ds in datasets}
+
+    def __iter__(self):
+        for data_loader in self.data_loaders:
+            for tmp_batch in data_loader:
+                yield data_loader.dataset.data_file, tmp_batch
+
+    def __len__(self):
+        # builtin sum stays an int for an empty list; np.sum returns 0.0
+        return sum(len(data_loader) for data_loader in self.data_loaders)
diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/space/fields/intent_field.py
index dc00e677..6d3b5fff 100644
--- a/modelscope/preprocessors/space/fields/intent_field.py
+++ b/modelscope/preprocessors/space/fields/intent_field.py
@@ -791,7 +791,6 @@ class BPETextField(object):
                                 user_or_sys = [self.sos_r_id]
                             tmp = [self.sos_u_id
                                    ] + self.numericalize(s) + user_or_sys
-                            tmp = tmp + self.numericalize(s) + [self.eos_r_id]
                             new_src.append(tmp)
 
                         src_span_mask = [[0] + list(map(int, s)) + [0]
diff --git a/modelscope/preprocessors/space/lazy_dataset.py b/modelscope/preprocessors/space/lazy_dataset.py
new file mode 100644
index 00000000..8da21db7
--- /dev/null
+++ b/modelscope/preprocessors/space/lazy_dataset.py
@@ -0,0 +1,47 @@
+"""
+Dataset class
+"""
+
+import json
+
+from modelscope.preprocessors.space.args import str2bool
+
+
+class LazyDataset(object):
+    """
+    Lazy load dataset from disk.
+
+    Each line of data file is a preprocessed example.
+    """
+
+    def __init__(self, data_file, reader, transform=lambda s: json.loads(s)):
+        """
+        Index the start offset of every line of data_file.
+
+        By default each line is decoded as json (.jsonl format).
+
+        :param data_file: path of the line-oriented data file
+        :param reader: object exposing with_mlm and
+            create_token_masked_lm_predictions, used in __getitem__
+        :param transform: callable applied to each stripped line
+        :type transform: callable
+        """
+        self.data_file = data_file
+        self.transform = transform
+        self.reader = reader
+        self.offsets = [0]
+        with open(data_file, 'r', encoding='utf-8') as fp:
+            while fp.readline() != '':
+                self.offsets.append(fp.tell())
+        self.offsets.pop()  # last offset is end-of-file, not a line start
+        self.fp = open(data_file, 'r', encoding='utf-8')  # kept open for seeks
+
+    def __len__(self):
+        return len(self.offsets)
+
+    def __getitem__(self, idx):
+        self.fp.seek(self.offsets[idx], 0)
+        sample = self.transform(self.fp.readline().strip())
+        if self.reader.with_mlm:
+            sample = self.reader.create_token_masked_lm_predictions(sample)
+        return sample
diff --git a/modelscope/preprocessors/space/preprocess.py b/modelscope/preprocessors/space/preprocess.py
new file mode 100644
index 00000000..bd8d64d1
--- /dev/null
+++ b/modelscope/preprocessors/space/preprocess.py
@@ -0,0 +1,48 @@
+"""
+Preprocess script.
+"""
+
+import glob
+import os
+
+from modelscope.preprocessors.space.args import parse_args
+from modelscope.preprocessors.space.fields.intent_field import \
+    IntentBPETextField
+
+FILE_NAME = 'train.json'
+
+
+def intent_preprocess(path, cfg):
+    """Create missing <mode>.<tokenizer>.jsonl / <mode>.Score.npy files."""
+    bpe = IntentBPETextField(path, cfg)
+    args = cfg.Dataset
+    build_examples_fn = bpe.build_examples_multi_turn if args.trigger_role == 'system' \
+        else bpe.build_examples_single_turn
+    build_score_matrix_fn = bpe.build_score_matrix
+    build_score_matrix_multiprocessing_fn = bpe.build_score_matrix_multiprocessing
+    data_paths = list(
+        os.path.dirname(c) for c in sorted(
+            glob.glob(args.data_dir + '/**/' + FILE_NAME, recursive=True)))
+    data_paths = bpe.filter_data_path(data_paths=data_paths)
+
+    for mode in ['train', 'valid', 'test']:
+        for data_path in data_paths:
+            input_file = os.path.join(data_path, f'{mode}.json')
+            output_file = os.path.join(data_path,
+                                       f'{mode}.{bpe.tokenizer_type}.jsonl')
+            output_score_file = os.path.join(data_path, f'{mode}.Score.npy')
+            if os.path.exists(input_file) and not os.path.exists(output_file):
+                examples = build_examples_fn(input_file, data_type=mode)
+                if examples:
+                    bpe.save_examples(examples, output_file)
+                else:
+                    continue  # nothing was built, so there is nothing to score
+            if os.path.exists(output_file) and not os.path.exists(output_score_file) and \
+                    not args.dynamic_score and 'AnPreDial' in data_path:
+                examples = bpe.load_examples(output_file)
+                if args.num_process >= 2:
+                    score_matrix = build_score_matrix_multiprocessing_fn(
+                        examples)
+                else:
+                    score_matrix = build_score_matrix_fn(examples)
+                bpe.save_examples(score_matrix, output_score_file)
diff --git a/modelscope/preprocessors/space/sampler.py b/modelscope/preprocessors/space/sampler.py
new file mode 100644
index 00000000..49a216d1
--- /dev/null
+++ b/modelscope/preprocessors/space/sampler.py
@@ -0,0 +1,75 @@
+"""
+Sampler class.
+"""
+
+import numpy as np
+
+
+class Sampler(object):
+    """ Abstract index sampler; subclasses implement __len__ / __iter__. """
+    def __init__(self):
+        pass
+
+    def __len__(self):
+        raise NotImplementedError
+
+    def __iter__(self):
+        raise NotImplementedError
+
+
+class SequentialSampler(Sampler):
+    """ Yield the indices 0 .. len(dataset) - 1 in ascending order. """
+
+    def __init__(self, dataset):
+        self.dataset = dataset
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __iter__(self):
+        return iter(range(len(self)))
+
+
+class RandomSampler(Sampler):
+    """ Yield a fresh permutation of the dataset indices on every pass. """
+
+    def __init__(self, dataset):
+        self.dataset, self.epoch = dataset, 0
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __iter__(self):
+        # the pass counter seeds numpy's global RNG, so order is repeatable
+        np.random.seed(self.epoch)
+        self.epoch += 1
+        return iter(np.random.permutation(len(self)))
+
+
+class SortedSampler(Sampler):
+    """ Sorted Sampler.
+    Indices drawn from the wrapped sampler are grouped into pools of
+    sort_pool_size and each pool is emitted sorted by len(example[key]).
+    """
+
+    def __init__(self, sampler, sort_pool_size, key='src'):
+        self.sampler = sampler
+        self.sort_pool_size = sort_pool_size
+        # the wrapped sampler must expose the dataset it draws from
+        self.key = lambda idx: len(self.sampler.dataset[idx][key])
+
+    def __len__(self):
+        return len(self.sampler)
+
+    def __iter__(self):
+        pending = []
+        for index in self.sampler:
+            pending.append(index)
+            if len(pending) == self.sort_pool_size:
+                pending.sort(key=self.key)
+                yield from pending
+                pending = []
+        # flush the last, partially filled pool
+        if pending:
+            pending.sort(key=self.key)
+            yield from pending
diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
new file mode 100644
index 00000000..515cd46d
--- /dev/null
+++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
@@ -0,0 +1,134 @@
+import os
+import time
+from typing import Callable, Dict, Optional, Tuple, Union
+
+import numpy as np
+
+from modelscope.metainfo import Trainers
+from modelscope.models.nlp.space.model.generator import Generator
+from modelscope.models.nlp.space.model.model_base import SpaceModelBase
+from modelscope.preprocessors.space.data_loader import \
+    get_sequential_data_loader
+from modelscope.preprocessors.space.fields.intent_field import \
+    IntentBPETextField
+from modelscope.preprocessors.space.preprocess import intent_preprocess
+from modelscope.trainers.base import BaseTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.nlp.space.trainer.intent_trainer import IntentTrainer
+from modelscope.utils.config import Config
+from modelscope.utils.logger import get_logger
+
+PATH = None
+logger = get_logger(PATH)
+
+
+@TRAINERS.register_module(module_name=Trainers.dialog_intent_trainer)
+class DialogIntentTrainer(BaseTrainer):
+    """Wires intent data loaders, a SpaceModelBase and an IntentTrainer."""
+    def __init__(self,
+                 cfg_file: Optional[str] = None,  # unused in this body
+                 cfg_modify_fn: Optional[Callable] = None,
+                 *args,
+                 **kwargs):
+        super().__init__(os.path.join(kwargs['model_dir'], kwargs['cfg_name']))
+
+        def to_tensor(array):
+            """
+            array -> torch tensor, moved to cuda when cfg.use_gpu is set
+            """
+            import torch
+            array = torch.tensor(array)
+            return array.cuda() if self.cfg.use_gpu else array
+
+        def setup_seed(seed):
+            import random
+            import torch
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed_all(seed)
+            np.random.seed(seed)
+            random.seed(seed)
+            torch.backends.cudnn.deterministic = True
+
+        self.cfg_modify_fn = cfg_modify_fn
+        self.cfg = self.rebuild_config(self.cfg)  # apply cfg_modify_fn
+
+        setup_seed(self.cfg.Trainer.seed)
+
+        # preprocess data
+        intent_preprocess(self.cfg.Model.init_checkpoint, self.cfg)
+        # set reader and evaluator
+        bpe = IntentBPETextField(self.cfg.Model.init_checkpoint, self.cfg)
+
+        self.cfg.Model.num_token_embeddings = bpe.vocab_size
+        self.cfg.Model.num_turn_embeddings = bpe.max_ctx_turn + 1
+        dataset_paths = [
+            os.path.join(self.cfg.Dataset.data_dir,
+                         self.cfg.Dataset.trigger_data)
+        ]
+        # set data and data status
+        collate_fn = bpe.collate_fn_multi_turn
+        self.train_label_loader = get_sequential_data_loader(
+            batch_size=self.cfg.Trainer.batch_size_label,
+            reader=bpe,
+            hparams=self.cfg,
+            data_paths=dataset_paths,
+            collate_fn=collate_fn,
+            data_type='train')
+        self.valid_label_loader = get_sequential_data_loader(
+            batch_size=self.cfg.Trainer.batch_size_label,
+            reader=bpe,
+            hparams=self.cfg,
+            data_paths=dataset_paths,
+            collate_fn=collate_fn,
+            data_type='valid')
+        self.test_label_loader = get_sequential_data_loader(
+            batch_size=self.cfg.Trainer.batch_size_label,
+            reader=bpe,
+            hparams=self.cfg,
+            data_paths=dataset_paths,
+            collate_fn=collate_fn,
+            data_type='test')
+
+        # set generator
+        generator = Generator.create(self.cfg, reader=bpe)
+        # construct model
+        self.model = SpaceModelBase.create(
+            self.cfg.Model.init_checkpoint,
+            self.cfg,
+            reader=bpe,
+            generator=generator)
+
+        import torch
+
+        # multi-gpu
+        if self.cfg.Trainer.gpu > 1 and torch.cuda.device_count() > 1:
+            self.model = torch.nn.DataParallel(self.model)
+
+        # construct trainer
+        self.trainer = IntentTrainer(
+            self.model, to_tensor, self.cfg, reader=bpe)
+        num_batches = len(self.train_label_loader)
+        self.trainer.set_optimizers(num_training_steps_per_epoch=num_batches)
+        # load model, optimizer and lr_scheduler
+        self.trainer.load()
+
+    def rebuild_config(self, cfg: Config):
+        if self.cfg_modify_fn is not None:
+            return self.cfg_modify_fn(cfg)
+        return cfg
+
+    def train(self, *args, **kwargs):
+        logger.info('Train')
+
+        self.trainer.train(
+            train_label_iter=self.train_label_loader,
+            valid_label_iter=self.valid_label_loader)
+
+    def evaluate(self,
+                 checkpoint_path: Optional[str] = None,  # unused in this body
+                 *args,
+                 **kwargs) -> Dict[str, float]:
+        logger.info('Evaluate')  # NOTE(review): returns None, not a dict
+        self.trainer.infer(
+            data_iter=self.test_label_loader,
+            ex_data_iter=self.train_label_loader)
diff --git a/tests/trainers/test_dialog_intent_trainer.py b/tests/trainers/test_dialog_intent_trainer.py
new file mode 100644
index 00000000..b183a690
--- /dev/null
+++ b/tests/trainers/test_dialog_intent_trainer.py
@@ -0,0 +1,101 @@
+import os
+import shutil
+import tempfile
+import unittest
+
+import json
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.config import Config
+from modelscope.utils.constant import DownloadMode, ModelFile, Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class TestDialogIntentTrainer(unittest.TestCase):
+    """Trains the dialog intent trainer on banking77 with a patched config."""
+
+    def setUp(self):
+        # mkdtemp creates the directory and returns its path in one step.
+        self.save_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.save_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_model_and_args(self):
+        model_id = 'damo/nlp_space_pretrained-dialog-model'
+        data_banking = MsDataset.load('banking77')
+        self.data_dir = data_banking._hf_ds.config_kwargs['split_config'][
+            'train']
+        self.model_dir = snapshot_download(model_id)
+        self.debugging = True
+        kwargs = dict(
+            model_dir=self.model_dir,
+            cfg_name='intent_train_config.json',
+            cfg_modify_fn=self.cfg_modify_fn)
+        trainer = build_trainer(
+            name=Trainers.dialog_intent_trainer, default_args=kwargs)
+        trainer.train()
+
+    def cfg_modify_fn(self, cfg):
+        """Overlay the banking77 test settings onto cfg and return it."""
+        config = {
+            'BPETextField': {
+                'vocab_path': '',  # NOTE: update() clobbers the path set below
+                'data_name': 'banking77',
+                'data_root': self.data_dir,
+                'understand': True,
+                'generation': False,
+                'max_len': 256
+            },
+            'Dataset': {
+                'data_dir': self.data_dir,
+                'with_contrastive': False,
+                'trigger_role': 'user',
+                'trigger_data': 'banking'
+            },
+            'Trainer': {
+                'can_norm': True,
+                'seed': 11,
+                'gpu': 1,
+                'save_dir': self.save_dir,
+                'batch_size_label': 128,
+                'batch_size_nolabel': 0,
+                'log_steps': 20
+            },
+            'Model': {
+                'init_checkpoint': self.model_dir,
+                'model': 'IntentUnifiedTransformer',
+                'example': False,
+                'num_intent': 77,
+                'with_rdrop': True,
+                'num_turn_embeddings': 21,
+                'dropout': 0.25,
+                'kl_ratio': 5.0,
+                'embed_dropout': 0.25,
+                'attn_dropout': 0.25,
+                'ff_dropout': 0.25,
+                'with_pool': False,
+                'warmup_steps': -1
+            }
+        }
+        cfg.BPETextField.vocab_path = os.path.join(self.model_dir,
+                                                   ModelFile.VOCAB_FILE)
+        cfg.num_intent = 77
+        cfg.Trainer.update(config['Trainer'])
+        cfg.BPETextField.update(config['BPETextField'])
+        cfg.Dataset.update(config['Dataset'])
+        cfg.Model.update(config['Model'])
+        if self.debugging:
+            cfg.Trainer.save_checkpoint = False
+            cfg.Trainer.num_epochs = 5
+            cfg.Trainer.batch_size_label = 64
+        return cfg
+
+
+if __name__ == '__main__':
+    unittest.main()

From 7a49fa1cc6d2029650310cb1745600a526b08dc9 Mon Sep 17 00:00:00 2001
From: "lingcai.wl" <lingcai.wl@alibaba-inc.com>
Date: Thu, 8 Sep 2022 14:08:51 +0800
Subject: [PATCH 081/175] [to #44657982] add unittest for demo and demotest
 utils

unittest for demo service
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10006180
---
 modelscope/utils/constant.py                  |  16 ++
 modelscope/utils/demo_utils.py                | 243 ++++++++++++++++++
 tests/pipelines/test_action_detection.py      |  17 +-
 tests/pipelines/test_action_recognition.py    |  19 +-
 tests/pipelines/test_animal_recognition.py    |  14 +-
 .../test_automatic_speech_recognition.py      |   9 +-
 tests/pipelines/test_body_2d_keypoints.py     |  16 +-
 tests/pipelines/test_body_3d_keypoints.py     |  10 +-
 .../pipelines/test_cmdssl_video_embedding.py  |  14 +-
 .../test_conversational_text_to_sql.py        |  30 +--
 tests/pipelines/test_crowd_counting.py        |  14 +-
 tests/pipelines/test_csanmt_translation.py    |  18 +-
 .../test_dialog_intent_prediction.py          |  17 +-
 tests/pipelines/test_dialog_modeling.py       |  18 +-
 tests/pipelines/test_dialog_state_tracking.py |  22 +-
 tests/pipelines/test_document_segmentation.py |  15 +-
 tests/pipelines/test_face_detection.py        |  10 +-
 tests/pipelines/test_face_image_generation.py |  12 +-
 tests/pipelines/test_face_recognition.py      |   8 +-
 .../pipelines/test_faq_question_answering.py  |  13 +-
 tests/pipelines/test_fill_mask.py             |  12 +-
 .../test_general_image_classification.py      |  12 +-
 tests/pipelines/test_general_recognition.py   |  11 +-
 .../test_generative_multi_modal_embedding.py  |  13 +-
 .../pipelines/test_hicossl_video_embedding.py |   8 +-
 tests/pipelines/test_image_color_enhance.py   |   8 +-
 tests/pipelines/test_image_colorization.py    |   8 +-
 tests/pipelines/test_image_denoise.py         |  13 +-
 .../test_image_instance_segmentation.py       |  13 +-
 tests/pipelines/test_image_matting.py         |   9 +-
 .../test_image_panoptic_segmentation.py       |  17 +-
 .../test_image_portrait_enhancement.py        |   8 +-
 tests/pipelines/test_image_reid_person.py     |   8 +-
 .../test_image_semantic_segmentation.py       |  18 +-
 tests/pipelines/test_image_style_transfer.py  |   8 +-
 .../pipelines/test_image_super_resolution.py  |   8 +-
 tests/pipelines/test_key_word_spotting.py     |   7 +-
 tests/pipelines/test_live_category.py         |  14 +-
 .../test_movie_scene_segmentation.py          |  14 +-
 tests/pipelines/test_mplug_tasks.py           |  11 +-
 tests/pipelines/test_multi_modal_embedding.py |  13 +-
 .../test_named_entity_recognition.py          |  12 +-
 tests/pipelines/test_nli.py                   |  13 +-
 tests/pipelines/test_object_detection.py      |  11 +-
 tests/pipelines/test_ocr_detection.py         |   8 +-
 tests/pipelines/test_ocr_recognition.py       |  15 +-
 tests/pipelines/test_ofa_tasks.py             |   7 +-
 tests/pipelines/test_person_image_cartoon.py  |   8 +-
 .../test_product_retrieval_embedding.py       |  13 +-
 .../test_realtime_object_detection.py         |  10 +-
 tests/pipelines/test_relation_extraction.py   |  15 +-
 tests/pipelines/test_salient_detection.py     |  11 +-
 tests/pipelines/test_sentence_similarity.py   |  13 +-
 .../test_sentiment_classification.py          |  14 +-
 tests/pipelines/test_skin_retouching.py       |   8 +-
 tests/pipelines/test_speech_signal_process.py |   9 +-
 tests/pipelines/test_text_classification.py   |  10 +-
 .../test_text_driven_segmentation.py          |   4 +
 tests/pipelines/test_text_error_correction.py |  13 +-
 tests/pipelines/test_text_generation.py       |   7 +-
 .../pipelines/test_text_to_image_synthesis.py |  13 +-
 tests/pipelines/test_text_to_speech.py        |  16 +-
 .../pipelines/test_tinynas_classification.py  |  11 +-
 tests/pipelines/test_tinynas_detection.py     |   4 +
 tests/pipelines/test_video_category.py        |  14 +-
 .../test_video_multi_modal_embedding.py       |  12 +-
 .../test_video_single_object_tracking.py      |   8 +-
 tests/pipelines/test_video_summarization.py   |  14 +-
 tests/pipelines/test_virtual_try_on.py        |  13 +-
 tests/pipelines/test_word_segmentation.py     |  14 +-
 .../test_zero_shot_classification.py          |  13 +-
 71 files changed, 913 insertions(+), 188 deletions(-)
 create mode 100644 modelscope/utils/demo_utils.py

diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 8fb00ed6..6d84925c 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -136,6 +136,22 @@ class MultiModalTasks(object):
     image_text_retrieval = 'image-text-retrieval'
 
 
+class TasksIODescriptions(object):  # demo I/O template names, all plain str
+    image_to_image = 'image_to_image'
+    images_to_image = 'images_to_image'
+    image_to_text = 'image_to_text'
+    seed_to_image = 'seed_to_image'
+    text_to_speech = 'text_to_speech'
+    text_to_text = 'text_to_text'
+    speech_to_text = 'speech_to_text'
+    speech_to_speech = 'speech_to_speech'
+    speeches_to_speech = 'speeches_to_speech'
+    visual_grounding = 'visual_grounding'
+    visual_question_answering = 'visual_question_answering'
+    visual_entailment = 'visual_entailment'
+    generative_multi_modal_embedding = 'generative_multi_modal_embedding'
+
+
 class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks):
     """ Names for tasks supported by modelscope.
 
diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py
new file mode 100644
index 00000000..0f8378cd
--- /dev/null
+++ b/modelscope/utils/demo_utils.py
@@ -0,0 +1,243 @@
+import io
+
+import cv2
+import json
+import numpy as np
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks, TasksIODescriptions
+
+TASKS_INPUT_TEMPLATES = {
+    # vision tasks
+    Tasks.image_portrait_stylization: TasksIODescriptions.image_to_image,
+    Tasks.portrait_matting: TasksIODescriptions.image_to_image,
+    Tasks.skin_retouching: TasksIODescriptions.image_to_image,
+    Tasks.image_captioning: TasksIODescriptions.image_to_text,
+    Tasks.image_denoising: TasksIODescriptions.image_to_image,
+    Tasks.image_portrait_enhancement: TasksIODescriptions.image_to_image,
+    Tasks.image_super_resolution: TasksIODescriptions.image_to_image,
+    Tasks.image_colorization: TasksIODescriptions.image_to_image,
+    Tasks.image_color_enhancement: TasksIODescriptions.image_to_image,
+    Tasks.face_image_generation: TasksIODescriptions.seed_to_image,
+    Tasks.image_style_transfer: TasksIODescriptions.images_to_image,
+    Tasks.image_segmentation: TasksIODescriptions.image_to_text,
+    Tasks.image_object_detection: TasksIODescriptions.image_to_text,
+
+    # not tested
+    Tasks.image_classification: TasksIODescriptions.image_to_text,
+    Tasks.ocr_detection: TasksIODescriptions.image_to_text,
+    Tasks.ocr_recognition: TasksIODescriptions.image_to_text,
+    Tasks.body_2d_keypoints: TasksIODescriptions.image_to_text,
+
+    # nlp tasks
+    Tasks.text_classification: TasksIODescriptions.text_to_text,
+    Tasks.text_generation: TasksIODescriptions.text_to_text,
+    Tasks.word_segmentation: TasksIODescriptions.text_to_text,
+    Tasks.text_error_correction: TasksIODescriptions.text_to_text,
+    Tasks.named_entity_recognition: TasksIODescriptions.text_to_text,
+    Tasks.sentiment_classification: TasksIODescriptions.text_to_text,
+
+    # audio tasks
+    Tasks.text_to_speech: TasksIODescriptions.text_to_speech,
+    Tasks.auto_speech_recognition: TasksIODescriptions.speech_to_text,
+    Tasks.keyword_spotting: TasksIODescriptions.speech_to_text,
+    Tasks.acoustic_noise_suppression: TasksIODescriptions.speech_to_speech,
+    Tasks.acoustic_echo_cancellation: TasksIODescriptions.speeches_to_speech,
+
+    # multi-modal
+    Tasks.visual_grounding: TasksIODescriptions.visual_grounding,
+    Tasks.visual_question_answering:
+    TasksIODescriptions.visual_question_answering,
+    Tasks.visual_entailment: TasksIODescriptions.visual_entailment,
+    Tasks.generative_multi_modal_embedding:
+    TasksIODescriptions.generative_multi_modal_embedding,
+
+    # new tasks
+    Tasks.virtual_try_on: TasksIODescriptions.images_to_image,
+
+    # TODO(lingcai.wl): support more tasks and implement corresponding example
+}
+
+INPUT_EXAMPLES = {
+    # Must align with task schema defined in the Widget section of model card
+    # cv
+    TasksIODescriptions.image_to_image: {
+        'inputs': [
+            'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png'
+        ],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_IMG,
+                'fileType': 'png'
+            }]
+        }
+    },
+    TasksIODescriptions.images_to_image: {
+        'inputs': [
+            'https://modelscope.oss-cn-beijing.aliyuncs.com/demo/image-style-transfer/style_transfer_content.jpg',
+            'https://modelscope.oss-cn-beijing.aliyuncs.com/demo/image-style-transfer/style_transfer_style.jpg'
+        ],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_IMG,
+                'fileType': 'png'
+            }]
+        }
+    },
+    TasksIODescriptions.image_to_text: {
+        'inputs': [
+            'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png'
+        ],
+        'urlPaths': {}
+    },
+    # nlp
+    TasksIODescriptions.text_to_text: {
+        'inputs': ['test'],
+        'urlPaths': {}
+    },
+
+    # audio
+    TasksIODescriptions.speech_to_text: {
+        'inputs': [
+            'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example.wav'
+        ],
+        'urlPaths': {}
+    },
+    TasksIODescriptions.text_to_speech: {
+        'inputs': ['北京今天天气怎么样'],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_PCM,
+                'fileType': 'pcm'
+            }]
+        }
+    },
+    TasksIODescriptions.speeches_to_speech: {
+        'inputs': [
+            'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/nearend_mic.wav',
+            'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/nearend_speech.wav'
+        ],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_PCM,
+                'fileType': 'wav'
+            }]
+        }
+    },
+    TasksIODescriptions.speech_to_speech: {
+        'inputs': [
+            'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/speech_with_noise.wav'
+        ],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_PCM,
+                'fileType': 'wav'
+            }]
+        }
+    },
+
+    # multi modal
+    TasksIODescriptions.visual_grounding: {
+        'task':
+        Tasks.visual_grounding,
+        'inputs': [
+            'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-grounding/visual_grounding.png',
+            'a blue turtle-like pokemon with round head'
+        ],
+        'urlPaths': {}
+    },
+    TasksIODescriptions.visual_question_answering: {
+        'task':
+        Tasks.visual_question_answering,
+        'inputs': [
+            'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/visual_question_answering.png',
+            'what is grown on the plant?'
+        ],
+        'urlPaths': {}
+    },
+    TasksIODescriptions.visual_entailment: {
+        'task':
+        Tasks.visual_entailment,
+        'inputs': [
+            'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-entailment/visual_entailment.jpg',
+            'there are two birds.', 'test'
+        ],
+        'urlPaths': {}
+    },
+    TasksIODescriptions.generative_multi_modal_embedding: {
+        'task':
+        Tasks.generative_multi_modal_embedding,
+        'inputs': [
+            'http://clip-multimodal.oss-cn-beijing.aliyuncs.com/lingchen/demo/dogs.jpg',
+            'dogs playing in the grass'
+        ],
+        'urlPaths': {}
+    },
+}
+
+
+class DemoCompatibilityCheck(object):
+    """Mixin running self.task / self.model_id through the demo I/O path."""
+
+    def compatibility_check(self):
+        task = self.task
+        if task not in TASKS_INPUT_TEMPLATES:
+            print('task is not supported in demo service so far')
+            return False
+        io_template = TASKS_INPUT_TEMPLATES[task]
+        if io_template not in INPUT_EXAMPLES:
+            print('no example input for this task')
+            return False
+        print('testing demo: ', task, self.model_id)
+        example = INPUT_EXAMPLES[io_template]
+        output = pipeline(task, self.model_id)(preprocess(example))
+        json.dumps(output, cls=NumpyEncoder)  # must be JSON-serializable
+        print(postprocess(example, output))
+        return True
+
+
+class NumpyEncoder(json.JSONEncoder):
+    """JSON encoder that converts numpy arrays and scalars to builtins."""
+
+    def default(self, obj):
+        # ndarray -> nested list; numpy float/int scalars -> Python float/int.
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        if isinstance(obj, np.floating):
+            return float(obj)
+        if isinstance(obj, np.integer):
+            return int(obj)
+        # Everything else is left to the base implementation.
+        return super().default(obj)
+
+
+def preprocess(req):
+    # A single input is passed bare; several are bundled into a tuple.
+    payload = req['inputs']
+    if len(payload) == 1:
+        return payload[0]
+    return tuple(payload)
+
+
+def postprocess(req, resp):
+    """Check that the outputs listed in req['urlPaths']['outUrls'] encode.
+
+    Returns resp as-is when no outUrls are declared, else the type (not the
+    payload) of the first png/jpg/wav output; other file types yield None.
+    """
+    out_urls = (req.get('urlPaths') or {}).get('outUrls')
+    if not out_urls:
+        return resp
+    new_resp = json.loads(resp) if isinstance(resp, str) else resp
+    for out_url in out_urls:
+        content = new_resp.get(out_url['outputKey'])
+        file_type = out_url['fileType']
+        if file_type in ('png', 'jpg'):
+            _, img_encode = cv2.imencode('.' + file_type, content)
+            return type(img_encode.tobytes())
+        elif file_type == 'wav':
+            out_mem_file = io.BytesIO()
+            out_mem_file.write(content)
+            return type(out_mem_file)
+        # TODO(lingcai.wl): support more file type
diff --git a/tests/pipelines/test_action_detection.py b/tests/pipelines/test_action_detection.py
index c752dc78..ae7e60b1 100644
--- a/tests/pipelines/test_action_detection.py
+++ b/tests/pipelines/test_action_detection.py
@@ -2,21 +2,28 @@
 import unittest
 
 from modelscope.pipelines import pipeline
-from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ActionDetectionTest(unittest.TestCase):
+class ActionDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.action_detection
+        self.model_id = 'damo/cv_ResNetC3D_action-detection_detection2d'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
-        action_detection_pipline = pipeline(
-            Tasks.action_detection,
-            model='damo/cv_ResNetC3D_action-detection_detection2d')
+        action_detection_pipline = pipeline(self.task, model=self.model_id)
         result = action_detection_pipline(
             'data/test/videos/action_detection_test_video.mp4')
         print('action detection results:', result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_action_recognition.py b/tests/pipelines/test_action_recognition.py
index e955eb60..b9548630 100644
--- a/tests/pipelines/test_action_recognition.py
+++ b/tests/pipelines/test_action_recognition.py
@@ -1,24 +1,21 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-# !/usr/bin/env python
-import os.path as osp
-import tempfile
 import unittest
 
-from modelscope.fileio import File
 from modelscope.pipelines import pipeline
-from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ActionRecognitionTest(unittest.TestCase):
+class ActionRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.action_recognition
         self.model_id = 'damo/cv_TAdaConv_action-recognition'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        recognition_pipeline = pipeline(
-            Tasks.action_recognition, model=self.model_id)
+        recognition_pipeline = pipeline(self.task, self.model_id)
         result = recognition_pipeline(
             'data/test/videos/action_recognition_test_video.mp4')
 
@@ -26,12 +23,16 @@ class ActionRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
-        recognition_pipeline = pipeline(Tasks.action_recognition)
+        recognition_pipeline = pipeline(self.task)
         result = recognition_pipeline(
             'data/test/videos/action_recognition_test_video.mp4')
 
         print(f'recognition output: {result}.')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_animal_recognition.py b/tests/pipelines/test_animal_recognition.py
index 3a31afed..7d5f0561 100644
--- a/tests/pipelines/test_animal_recognition.py
+++ b/tests/pipelines/test_animal_recognition.py
@@ -2,19 +2,27 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class AnimalRecognitionTest(unittest.TestCase):
+class AnimalRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.animal_recognition
+        self.model_id = 'damo/cv_resnest101_animal_recognition'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
         animal_recognition = pipeline(
-            Tasks.animal_recognition,
-            model='damo/cv_resnest101_animal_recognition')
+            Tasks.animal_recognition, model=self.model_id)
         result = animal_recognition('data/test/images/dogs.jpg')
         print(result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py
index 7f4ce88e..3c4327be 100644
--- a/tests/pipelines/test_automatic_speech_recognition.py
+++ b/tests/pipelines/test_automatic_speech_recognition.py
@@ -10,6 +10,7 @@ import soundfile
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import ColorCodes, Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import download_and_untar, test_level
 
@@ -22,7 +23,8 @@ LITTLE_TESTSETS_FILE = 'data_aishell.tar.gz'
 LITTLE_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/data_aishell.tar.gz'
 
 
-class AutomaticSpeechRecognitionTest(unittest.TestCase):
+class AutomaticSpeechRecognitionTest(unittest.TestCase,
+                                     DemoCompatibilityCheck):
     action_info = {
         'test_run_with_wav_pytorch': {
             'checking_item': OutputKeys.TEXT,
@@ -74,6 +76,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
         self.am_tf_model_id = 'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1'
         # this temporary workspace dir will store waveform files
         self.workspace = os.path.join(os.getcwd(), '.tmp')
+        self.task = Tasks.auto_speech_recognition
         if not os.path.exists(self.workspace):
             os.mkdir(self.workspace)
 
@@ -254,6 +257,10 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
             model_id=self.am_tf_model_id, audio_in=dataset_path)
         self.check_result('test_run_with_wav_dataset_tf', rec_result)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_body_2d_keypoints.py b/tests/pipelines/test_body_2d_keypoints.py
index d010adc5..5d90cbf0 100644
--- a/tests/pipelines/test_body_2d_keypoints.py
+++ b/tests/pipelines/test_body_2d_keypoints.py
@@ -2,20 +2,20 @@
 import unittest
 
 import cv2
-import numpy as np
 from PIL import Image
 
-from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import draw_keypoints
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class Body2DKeypointsTest(unittest.TestCase):
+class Body2DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.body_2d_keypoints
         self.model_id = 'damo/cv_hrnetv2w32_body-2d-keypoints_image'
         self.test_image = 'data/test/images/keypoints_detect/000000438862.jpg'
 
@@ -26,16 +26,18 @@ class Body2DKeypointsTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub_with_image_file(self):
-        body_2d_keypoints = pipeline(
-            Tasks.body_2d_keypoints, model=self.model_id)
+        body_2d_keypoints = pipeline(self.task, model=self.model_id)
         self.pipeline_inference(body_2d_keypoints, self.test_image)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub_with_image_input(self):
-        body_2d_keypoints = pipeline(
-            Tasks.body_2d_keypoints, model=self.model_id)
+        body_2d_keypoints = pipeline(self.task, model=self.model_id)
         self.pipeline_inference(body_2d_keypoints, Image.open(self.test_image))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py
index 50426414..9dce0d19 100644
--- a/tests/pipelines/test_body_3d_keypoints.py
+++ b/tests/pipelines/test_body_3d_keypoints.py
@@ -1,23 +1,23 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import pdb
 import unittest
 
 import cv2
 import numpy as np
-from PIL import Image
 
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class Body3DKeypointsTest(unittest.TestCase):
+class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_canonical_body-3d-keypoints_video'
         self.test_video = 'data/test/videos/Walking.54138969.mp4'
+        self.task = Tasks.body_3d_keypoints
 
     def pipeline_inference(self, pipeline: Pipeline, pipeline_input):
         output = pipeline(pipeline_input)
@@ -44,6 +44,10 @@ class Body3DKeypointsTest(unittest.TestCase):
         body_3d_keypoints = pipeline(Tasks.body_3d_keypoints)
         self.pipeline_inference(body_3d_keypoints, self.test_video)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_cmdssl_video_embedding.py b/tests/pipelines/test_cmdssl_video_embedding.py
index 694ebf40..2a4cade1 100644
--- a/tests/pipelines/test_cmdssl_video_embedding.py
+++ b/tests/pipelines/test_cmdssl_video_embedding.py
@@ -4,20 +4,28 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class CMDSSLVideoEmbeddingTest(unittest.TestCase):
+class CMDSSLVideoEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_embedding
+        self.model_id = 'damo/cv_r2p1d_video_embedding'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        videossl_pipeline = pipeline(
-            Tasks.video_embedding, model='damo/cv_r2p1d_video_embedding')
+        videossl_pipeline = pipeline(task=self.task, model=self.model_id)
         result = videossl_pipeline(
             'data/test/videos/action_recognition_test_video.mp4')
 
         print(f'video embedding output: {result}.')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_conversational_text_to_sql.py b/tests/pipelines/test_conversational_text_to_sql.py
index 0504cb7c..80c72337 100644
--- a/tests/pipelines/test_conversational_text_to_sql.py
+++ b/tests/pipelines/test_conversational_text_to_sql.py
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import unittest
-from typing import List
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
@@ -9,11 +8,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline
 from modelscope.preprocessors import ConversationalTextToSqlPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.nlp.nlp_utils import text2sql_tracking_and_print_results
 from modelscope.utils.test_utils import test_level
 
 
-class ConversationalTextToSql(unittest.TestCase):
+class ConversationalTextToSql(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.conversational_text_to_sql
+        self.model_id = 'damo/nlp_star_conversational-text-to-sql'
+
     model_id = 'damo/nlp_star_conversational-text-to-sql'
     test_case = {
         'database_id':
@@ -39,10 +44,7 @@ class ConversationalTextToSql(unittest.TestCase):
         pipelines = [
             ConversationalTextToSqlPipeline(
                 model=model, preprocessor=preprocessor),
-            pipeline(
-                task=Tasks.conversational_text_to_sql,
-                model=model,
-                preprocessor=preprocessor)
+            pipeline(task=self.task, model=model, preprocessor=preprocessor)
         ]
         text2sql_tracking_and_print_results(self.test_case, pipelines)
 
@@ -55,26 +57,24 @@ class ConversationalTextToSql(unittest.TestCase):
         pipelines = [
             ConversationalTextToSqlPipeline(
                 model=model, preprocessor=preprocessor),
-            pipeline(
-                task=Tasks.conversational_text_to_sql,
-                model=model,
-                preprocessor=preprocessor)
+            pipeline(task=self.task, model=model, preprocessor=preprocessor)
         ]
         text2sql_tracking_and_print_results(self.test_case, pipelines)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
-        pipelines = [
-            pipeline(
-                task=Tasks.conversational_text_to_sql, model=self.model_id)
-        ]
+        pipelines = [pipeline(task=self.task, model=self.model_id)]
         text2sql_tracking_and_print_results(self.test_case, pipelines)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
-        pipelines = [pipeline(task=Tasks.conversational_text_to_sql)]
+        pipelines = [pipeline(task=self.task)]
         text2sql_tracking_and_print_results(self.test_case, pipelines)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_crowd_counting.py b/tests/pipelines/test_crowd_counting.py
index 99f5ffd2..4e15cfca 100644
--- a/tests/pipelines/test_crowd_counting.py
+++ b/tests/pipelines/test_crowd_counting.py
@@ -8,17 +8,19 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import numpy_to_cv2img
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import test_level
 
 logger = get_logger()
 
 
-class CrowdCountingTest(unittest.TestCase):
+class CrowdCountingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.input_location = 'data/test/images/crowd_counting.jpg'
         self.model_id = 'damo/cv_hrnet_crowd-counting_dcanet'
+        self.task = Tasks.crowd_counting
 
     def save_result(self, result):
         print('scores:', result[OutputKeys.SCORES])
@@ -28,7 +30,7 @@ class CrowdCountingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_crowd_counting(self):
-        crowd_counting = pipeline(Tasks.crowd_counting, model=self.model_id)
+        crowd_counting = pipeline(task=self.task, model=self.model_id)
         result = crowd_counting(self.input_location)
         if result:
             self.save_result(result)
@@ -37,7 +39,7 @@ class CrowdCountingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_crowd_counting_with_image(self):
-        crowd_counting = pipeline(Tasks.crowd_counting, model=self.model_id)
+        crowd_counting = pipeline(task=self.task, model=self.model_id)
         img = Image.open(self.input_location)
         result = crowd_counting(img)
         if result:
@@ -47,13 +49,17 @@ class CrowdCountingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_crowd_counting_with_default_task(self):
-        crowd_counting = pipeline(Tasks.crowd_counting)
+        crowd_counting = pipeline(self.task)
         result = crowd_counting(self.input_location)
         if result:
             self.save_result(result)
         else:
             raise ValueError('process error')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py
index bb6022ec..f7ec81cd 100644
--- a/tests/pipelines/test_csanmt_translation.py
+++ b/tests/pipelines/test_csanmt_translation.py
@@ -3,31 +3,39 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TranslationTest(unittest.TestCase):
+class TranslationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.translation
+        self.model_id = 'damo/nlp_csanmt_translation_zh2en'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name_for_zh2en(self):
-        model_id = 'damo/nlp_csanmt_translation_zh2en'
         inputs = '声明补充说，沃伦的同事都深感震惊，并且希望他能够投案自首。'
-        pipeline_ins = pipeline(task=Tasks.translation, model=model_id)
+        pipeline_ins = pipeline(self.task, model=self.model_id)
         print(pipeline_ins(input=inputs))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name_for_en2zh(self):
         model_id = 'damo/nlp_csanmt_translation_en2zh'
         inputs = 'Elon Musk, co-founder and chief executive officer of Tesla Motors.'
-        pipeline_ins = pipeline(task=Tasks.translation, model=model_id)
+        pipeline_ins = pipeline(self.task, model=model_id)
         print(pipeline_ins(input=inputs))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         inputs = '声明补充说，沃伦的同事都深感震惊，并且希望他能够投案自首。'
-        pipeline_ins = pipeline(task=Tasks.translation)
+        pipeline_ins = pipeline(self.task)
         print(pipeline_ins(input=inputs))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_dialog_intent_prediction.py b/tests/pipelines/test_dialog_intent_prediction.py
index afd68442..5894297f 100644
--- a/tests/pipelines/test_dialog_intent_prediction.py
+++ b/tests/pipelines/test_dialog_intent_prediction.py
@@ -8,11 +8,16 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import DialogIntentPredictionPipeline
 from modelscope.preprocessors import DialogIntentPredictionPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class DialogIntentPredictionTest(unittest.TestCase):
-    model_id = 'damo/nlp_space_dialog-intent-prediction'
+class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.task_oriented_conversation
+        self.model_id = 'damo/nlp_space_dialog-intent-prediction'
+
     test_case = [
         'How do I locate my card?',
         'I still have not received my new card, I ordered over a week ago.'
@@ -61,13 +66,15 @@ class DialogIntentPredictionTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipelines = [
             pipeline(
-                task=Tasks.task_oriented_conversation,
-                model=self.model_id,
-                model_revision='update')
+                task=self.task, model=self.model_id, model_revision='update')
         ]
         for my_pipeline, item in list(zip(pipelines, self.test_case)):
             print(my_pipeline(item))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_dialog_modeling.py b/tests/pipelines/test_dialog_modeling.py
index 299af2e9..19d6ed2f 100644
--- a/tests/pipelines/test_dialog_modeling.py
+++ b/tests/pipelines/test_dialog_modeling.py
@@ -10,11 +10,16 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import DialogModelingPipeline
 from modelscope.preprocessors import DialogModelingPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class DialogModelingTest(unittest.TestCase):
-    model_id = 'damo/nlp_space_dialog-modeling'
+class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.task_oriented_conversation
+        self.model_id = 'damo/nlp_space_dialog-modeling'
+
     test_case = {
         'sng0073': {
             'goal': {
@@ -139,7 +144,7 @@ class DialogModelingTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipelines = [
             pipeline(
-                task=Tasks.task_oriented_conversation,
+                task=self.task,
                 model=self.model_id,
                 model_revision='task_oriented_conversation')
         ]
@@ -149,11 +154,14 @@ class DialogModelingTest(unittest.TestCase):
     def test_run_with_default_model(self):
         pipelines = [
             pipeline(
-                task=Tasks.task_oriented_conversation,
-                model_revision='task_oriented_conversation')
+                task=self.task, model_revision='task_oriented_conversation')
         ]
         self.generate_and_print_dialog_response(pipelines)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_dialog_state_tracking.py b/tests/pipelines/test_dialog_state_tracking.py
index 843aade9..81bdd9be 100644
--- a/tests/pipelines/test_dialog_state_tracking.py
+++ b/tests/pipelines/test_dialog_state_tracking.py
@@ -8,12 +8,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import DialogStateTrackingPipeline
 from modelscope.preprocessors import DialogStateTrackingPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.nlp.nlp_utils import tracking_and_print_dialog_states
 from modelscope.utils.test_utils import test_level
 
 
-class DialogStateTrackingTest(unittest.TestCase):
-    model_id = 'damo/nlp_space_dialog-state-tracking'
+class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.task_oriented_conversation
+        self.model_id = 'damo/nlp_space_dialog-state-tracking'
+
     test_case = [{
         'User-1':
         'Hi, I\'m looking for a train that is going to cambridge and arriving there by 20:45, '
@@ -103,10 +108,7 @@ class DialogStateTrackingTest(unittest.TestCase):
         pipelines = [
             DialogStateTrackingPipeline(
                 model=model, preprocessor=preprocessor),
-            pipeline(
-                task=Tasks.task_oriented_conversation,
-                model=model,
-                preprocessor=preprocessor)
+            pipeline(task=self.task, model=model, preprocessor=preprocessor)
         ]
 
         tracking_and_print_dialog_states(self.test_case, pipelines)
@@ -115,12 +117,14 @@ class DialogStateTrackingTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipelines = [
             pipeline(
-                task=Tasks.task_oriented_conversation,
-                model=self.model_id,
-                model_revision='update')
+                task=self.task, model=self.model_id, model_revision='update')
         ]
         tracking_and_print_dialog_states(self.test_case, pipelines)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_document_segmentation.py b/tests/pipelines/test_document_segmentation.py
index 39609be8..b4406fef 100644
--- a/tests/pipelines/test_document_segmentation.py
+++ b/tests/pipelines/test_document_segmentation.py
@@ -6,13 +6,18 @@ from typing import Any, Dict
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import test_level
 
 logger = get_logger()
 
 
-class DocumentSegmentationTest(unittest.TestCase):
+class DocumentSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.document_segmentation
+        self.model_id = 'damo/nlp_bert_document-segmentation_chinese-base'
 
     model_id = 'damo/nlp_bert_document-segmentation_chinese-base'
     eng_model_id = 'damo/nlp_bert_document-segmentation_english-base'
@@ -21,10 +26,8 @@ class DocumentSegmentationTest(unittest.TestCase):
     eng_sentences = 'The Saint Alexander Nevsky Church was established in 1936 by Archbishop Vitaly (Maximenko) () on a tract of land donated by Yulia Martinovna Plavskaya.The initial chapel, dedicated to the memory of the great prince St. Alexander Nevsky (1220–1263), was blessed in May, 1936.The church building was subsequently expanded three times.In 1987, ground was cleared for the construction of the new church and on September 12, 1989, on the Feast Day of St. Alexander Nevsky, the cornerstone was laid and the relics of St. Herman of Alaska placed in the foundation.The imposing edifice, completed in 1997, is the work of Nikolaus Karsanov, architect and Protopresbyter Valery Lukianov, engineer.Funds were raised through donations.The Great blessing of the cathedral took place on October 18, 1997 with seven bishops, headed by Metropolitan Vitaly Ustinov, and 36 priests and deacons officiating, some 800 faithful attended the festivity.The old church was rededicated to Our Lady of Tikhvin.Metropolitan Hilarion (Kapral) announced, that cathedral will officially become the episcopal See of the Ruling Bishop of the Eastern American Diocese and the administrative center of the Diocese on September 12, 2014.At present the parish serves the spiritual needs of 300 members.The parochial school instructs over 90 boys and girls in religion, Russian language and history.The school meets every Saturday.The choir is directed by Andrew Burbelo.The sisterhood attends to the needs of the church and a church council acts in the administration of the community.The cathedral is decorated by frescoes in the Byzantine style.The iconography project was fulfilled by Father Andrew Erastov and his students from 1995 until 2001.'  # noqa *
 
     def run_pipeline(self, model_id: str, documents: str) -> Dict[str, Any]:
-        p = pipeline(task=Tasks.document_segmentation, model=model_id)
-
+        p = pipeline(task=self.task, model=model_id)
         result = p(documents=documents)
-
         return result
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -51,6 +54,10 @@ class DocumentSegmentationTest(unittest.TestCase):
         for document in documents_list:
             print(document)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_face_detection.py b/tests/pipelines/test_face_detection.py
index 03dd75a6..f89e9a94 100644
--- a/tests/pipelines/test_face_detection.py
+++ b/tests/pipelines/test_face_detection.py
@@ -3,19 +3,19 @@ import os.path as osp
 import unittest
 
 import cv2
-import numpy as np
 
 from modelscope.msdatasets import MsDataset
-from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import draw_face_detection_result
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class FaceDetectionTest(unittest.TestCase):
+class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.face_detection
         self.model_id = 'damo/cv_resnet_facedetection_scrfd10gkps'
 
     def show_result(self, img_path, detection_result):
@@ -49,6 +49,10 @@ class FaceDetectionTest(unittest.TestCase):
         result = face_detection(img_path)
         self.show_result(img_path, result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_face_image_generation.py b/tests/pipelines/test_face_image_generation.py
index c758ea3a..21d8e835 100644
--- a/tests/pipelines/test_face_image_generation.py
+++ b/tests/pipelines/test_face_image_generation.py
@@ -8,12 +8,14 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class FaceGenerationTest(unittest.TestCase):
+class FaceGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.face_image_generation
         self.model_id = 'damo/cv_gan_face-image-generation'
 
     def pipeline_inference(self, pipeline: Pipeline, seed: int):
@@ -26,7 +28,7 @@ class FaceGenerationTest(unittest.TestCase):
     def test_run_modelhub(self):
         seed = 10
         face_generation = pipeline(
-            Tasks.face_image_generation,
+            self.task,
             model=self.model_id,
         )
         self.pipeline_inference(face_generation, seed)
@@ -34,9 +36,13 @@ class FaceGenerationTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         seed = 10
-        face_generation = pipeline(Tasks.face_image_generation)
+        face_generation = pipeline(self.task)
         self.pipeline_inference(face_generation, seed)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_face_recognition.py b/tests/pipelines/test_face_recognition.py
index 015205d6..d3451f5d 100644
--- a/tests/pipelines/test_face_recognition.py
+++ b/tests/pipelines/test_face_recognition.py
@@ -6,12 +6,14 @@ import numpy as np
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class FaceRecognitionTest(unittest.TestCase):
+class FaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.face_recognition
         self.model_id = 'damo/cv_ir101_facerecognition_cfglint'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -26,6 +28,10 @@ class FaceRecognitionTest(unittest.TestCase):
         sim = np.dot(emb1[0], emb2[0])
         print(f'Cos similarity={sim:.3f}, img1:{img1}  img2:{img2}')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py
index 3a87643c..7eea0ddf 100644
--- a/tests/pipelines/test_faq_question_answering.py
+++ b/tests/pipelines/test_faq_question_answering.py
@@ -11,11 +11,16 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import FaqQuestionAnsweringPipeline
 from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class FaqQuestionAnsweringTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base'
+class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.faq_question_answering
+        self.model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base'
+
     param = {
         'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'],
         'support_set': [{
@@ -80,6 +85,10 @@ class FaqQuestionAnsweringTest(unittest.TestCase):
             ['今天星期六', '明天星期几明天星期几'])
         print(np.shape(sentence_vec))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py
index 6b37f6df..cec8966f 100644
--- a/tests/pipelines/test_fill_mask.py
+++ b/tests/pipelines/test_fill_mask.py
@@ -9,11 +9,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import FillMaskPipeline
 from modelscope.preprocessors import FillMaskPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class FillMaskTest(unittest.TestCase):
+class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.fill_mask
+        self.model_id = 'damo/nlp_veco_fill-mask-large'
+
     model_id_sbert = {
         'zh': 'damo/nlp_structbert_fill-mask_chinese-large',
         'en': 'damo/nlp_structbert_fill-mask_english-large'
@@ -134,6 +140,10 @@ class FillMaskTest(unittest.TestCase):
         print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
               f'{pipeline_ins(test_input)}\n')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_general_image_classification.py b/tests/pipelines/test_general_image_classification.py
index 8a814f4a..b35f3696 100644
--- a/tests/pipelines/test_general_image_classification.py
+++ b/tests/pipelines/test_general_image_classification.py
@@ -2,10 +2,16 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class GeneralImageClassificationTest(unittest.TestCase):
+class GeneralImageClassificationTest(unittest.TestCase,
+                                     DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_classification
+        self.model_id = 'damo/cv_vit-base_image-classification_Dailylife-labels'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_ImageNet(self):
@@ -29,6 +35,10 @@ class GeneralImageClassificationTest(unittest.TestCase):
         result = general_image_classification('data/test/images/bird.JPEG')
         print(result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_general_recognition.py b/tests/pipelines/test_general_recognition.py
index 0b32e1f5..cbcb927b 100644
--- a/tests/pipelines/test_general_recognition.py
+++ b/tests/pipelines/test_general_recognition.py
@@ -2,10 +2,15 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class GeneralRecognitionTest(unittest.TestCase):
+class GeneralRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.general_recognition
+        self.model_id = 'damo/cv_resnest101_general_recognition'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
@@ -15,6 +20,10 @@ class GeneralRecognitionTest(unittest.TestCase):
         result = general_recognition('data/test/images/dogs.jpg')
         print(result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_generative_multi_modal_embedding.py b/tests/pipelines/test_generative_multi_modal_embedding.py
index d8593abb..464c0d36 100644
--- a/tests/pipelines/test_generative_multi_modal_embedding.py
+++ b/tests/pipelines/test_generative_multi_modal_embedding.py
@@ -5,11 +5,16 @@ import unittest
 from modelscope.models import Model
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class GEMMMultiModalEmbeddingTest(unittest.TestCase):
-    model_id = 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
+class GEMMMultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.generative_multi_modal_embedding
+        self.model_id = 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
+
     test_input = {
         'image': 'data/test/images/generative_multimodal.jpg',
         'text':
@@ -63,6 +68,10 @@ class GEMMMultiModalEmbeddingTest(unittest.TestCase):
         output = generative_multi_modal_embedding_pipeline(test_input)
         print(output)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_hicossl_video_embedding.py b/tests/pipelines/test_hicossl_video_embedding.py
index 5615cef2..dea2e020 100644
--- a/tests/pipelines/test_hicossl_video_embedding.py
+++ b/tests/pipelines/test_hicossl_video_embedding.py
@@ -4,12 +4,14 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class HICOSSLVideoEmbeddingTest(unittest.TestCase):
+class HICOSSLVideoEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.video_embedding
         self.model_id = 'damo/cv_s3dg_video-embedding'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -21,6 +23,10 @@ class HICOSSLVideoEmbeddingTest(unittest.TestCase):
 
         print(f'video embedding output: {result}.')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_color_enhance.py b/tests/pipelines/test_image_color_enhance.py
index c8ea5f9c..9b72999e 100644
--- a/tests/pipelines/test_image_color_enhance.py
+++ b/tests/pipelines/test_image_color_enhance.py
@@ -8,13 +8,15 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageColorEnhanceTest(unittest.TestCase):
+class ImageColorEnhanceTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_csrnet_image-color-enhance-models'
+        self.task = Tasks.image_color_enhancement
 
     def pipeline_inference(self, pipeline: Pipeline, input_location: str):
         result = pipeline(input_location)
@@ -36,6 +38,10 @@ class ImageColorEnhanceTest(unittest.TestCase):
         self.pipeline_inference(img_color_enhance,
                                 'data/test/images/image_color_enhance.png')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_colorization.py b/tests/pipelines/test_image_colorization.py
index 1a02cffb..a4b132ab 100644
--- a/tests/pipelines/test_image_colorization.py
+++ b/tests/pipelines/test_image_colorization.py
@@ -8,14 +8,16 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageColorizationTest(unittest.TestCase):
+class ImageColorizationTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_unet_image-colorization'
         self.test_image = 'data/test/images/marilyn_monroe_4.jpg'
+        self.task = Tasks.image_colorization
 
     def pipeline_inference(self, pipeline: Pipeline, test_image: str):
         result = pipeline(test_image)
@@ -35,6 +37,10 @@ class ImageColorizationTest(unittest.TestCase):
         image_colorization = pipeline(Tasks.image_colorization)
         self.pipeline_inference(image_colorization, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_denoise.py b/tests/pipelines/test_image_denoise.py
index d3e0af24..4a9df462 100644
--- a/tests/pipelines/test_image_denoise.py
+++ b/tests/pipelines/test_image_denoise.py
@@ -10,11 +10,16 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.cv import ImageDenoisePipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageDenoiseTest(unittest.TestCase):
-    model_id = 'damo/cv_nafnet_image-denoise_sidd'
+class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_denoising
+        self.model_id = 'damo/cv_nafnet_image-denoise_sidd'
+
     demo_image_path = 'data/test/images/noisy-demo-1.png'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -56,6 +61,10 @@ class ImageDenoiseTest(unittest.TestCase):
         w, h = denoise_img.size
         print('pipeline: the shape of output_img is {}x{}'.format(h, w))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_instance_segmentation.py b/tests/pipelines/test_image_instance_segmentation.py
index cd08d669..520bc99c 100644
--- a/tests/pipelines/test_image_instance_segmentation.py
+++ b/tests/pipelines/test_image_instance_segmentation.py
@@ -12,11 +12,16 @@ from modelscope.pipelines.cv import ImageInstanceSegmentationPipeline
 from modelscope.preprocessors import build_preprocessor
 from modelscope.utils.config import Config
 from modelscope.utils.constant import Fields, ModelFile, Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageInstanceSegmentationTest(unittest.TestCase):
-    model_id = 'damo/cv_swin-b_image-instance-segmentation_coco'
+class ImageInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id = 'damo/cv_swin-b_image-instance-segmentation_coco'
+
     image = 'data/test/images/image_instance_segmentation.jpg'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -56,6 +61,10 @@ class ImageInstanceSegmentationTest(unittest.TestCase):
         print(f'pipeline1:{pipeline1(input=self.image)[OutputKeys.LABELS]}')
         print(f'pipeline2: {pipeline2(input=self.image)[OutputKeys.LABELS]}')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py
index 83b7fee2..2d78f164 100644
--- a/tests/pipelines/test_image_matting.py
+++ b/tests/pipelines/test_image_matting.py
@@ -1,19 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
-import tempfile
 import unittest
 
 import cv2
 
-from modelscope.fileio import File
 from modelscope.msdatasets import MsDataset
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageMattingTest(unittest.TestCase):
+class ImageMattingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_unet_image-matting'
@@ -62,6 +61,10 @@ class ImageMattingTest(unittest.TestCase):
             f'Output written to dir: {osp.dirname(osp.abspath("result_0.png"))}'
         )
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_panoptic_segmentation.py b/tests/pipelines/test_image_panoptic_segmentation.py
index 3f07adf5..8c23ee6c 100644
--- a/tests/pipelines/test_image_panoptic_segmentation.py
+++ b/tests/pipelines/test_image_panoptic_segmentation.py
@@ -7,16 +7,20 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import panoptic_seg_masks_to_image
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImagePanopticSegmentationTest(unittest.TestCase):
+class ImagePanopticSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_image_panoptic_segmentation(self):
         input_location = 'data/test/images/image_panoptic_segmentation.jpg'
-        model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan'
-        pan_segmentor = pipeline(Tasks.image_segmentation, model=model_id)
+        pan_segmentor = pipeline(Tasks.image_segmentation, model=self.model_id)
         result = pan_segmentor(input_location)
 
         draw_img = panoptic_seg_masks_to_image(result[OutputKeys.MASKS])
@@ -26,8 +30,7 @@ class ImagePanopticSegmentationTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_image_panoptic_segmentation_from_PIL(self):
         input_location = 'data/test/images/image_panoptic_segmentation.jpg'
-        model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan'
-        pan_segmentor = pipeline(Tasks.image_segmentation, model=model_id)
+        pan_segmentor = pipeline(Tasks.image_segmentation, model=self.model_id)
         PIL_array = PIL.Image.open(input_location)
         result = pan_segmentor(PIL_array)
 
@@ -35,6 +38,10 @@ class ImagePanopticSegmentationTest(unittest.TestCase):
         cv2.imwrite('result.jpg', draw_img)
         print('print test_image_panoptic_segmentation from PIL return success')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_portrait_enhancement.py b/tests/pipelines/test_image_portrait_enhancement.py
index 834fcfdb..83a70a0c 100644
--- a/tests/pipelines/test_image_portrait_enhancement.py
+++ b/tests/pipelines/test_image_portrait_enhancement.py
@@ -9,12 +9,14 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImagePortraitEnhancementTest(unittest.TestCase):
+class ImagePortraitEnhancementTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.image_portrait_enhancement
         self.model_id = 'damo/cv_gpen_image-portrait-enhancement'
         self.test_image = 'data/test/images/Solvay_conference_1927.png'
 
@@ -37,6 +39,10 @@ class ImagePortraitEnhancementTest(unittest.TestCase):
         face_enhancement = pipeline(Tasks.image_portrait_enhancement)
         self.pipeline_inference(face_enhancement, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_reid_person.py b/tests/pipelines/test_image_reid_person.py
index c3e8d487..a4074b58 100644
--- a/tests/pipelines/test_image_reid_person.py
+++ b/tests/pipelines/test_image_reid_person.py
@@ -6,14 +6,16 @@ from PIL import Image
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageReidPersonTest(unittest.TestCase):
+class ImageReidPersonTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.input_location = 'data/test/images/image_reid_person.jpg'
         self.model_id = 'damo/cv_passvitb_image-reid-person_market'
+        self.task = Tasks.image_reid_person
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_image_reid_person(self):
@@ -48,6 +50,10 @@ class ImageReidPersonTest(unittest.TestCase):
         )
         print(f'The img embedding is: {result[OutputKeys.IMG_EMBEDDING]}')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_semantic_segmentation.py b/tests/pipelines/test_image_semantic_segmentation.py
index 6738976c..82e606a3 100644
--- a/tests/pipelines/test_image_semantic_segmentation.py
+++ b/tests/pipelines/test_image_semantic_segmentation.py
@@ -7,17 +7,20 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import semantic_seg_masks_to_image
-from modelscope.utils.logger import get_logger
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageSemanticSegmentationTest(unittest.TestCase):
+class ImageSemanticSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id = 'damo/cv_swinL_semantic-segmentation_cocopanmerge'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_image_semantic_segmentation_panmerge(self):
         input_location = 'data/test/images/image_semantic_segmentation.jpg'
-        model_id = 'damo/cv_swinL_semantic-segmentation_cocopanmerge'
-        segmenter = pipeline(Tasks.image_segmentation, model=model_id)
+        segmenter = pipeline(Tasks.image_segmentation, model=self.model_id)
         result = segmenter(input_location)
 
         draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
@@ -34,8 +37,9 @@ class ImageSemanticSegmentationTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_image_semantic_segmentation_vitadapter(self):
         input_location = 'data/test/images/image_semantic_segmentation.jpg'
+        # Not self.model_id: setUp pins the swinL panmerge model instead.
         model_id = 'damo/cv_vitadapter_semantic-segmentation_cocostuff164k'
         segmenter = pipeline(Tasks.image_segmentation, model=model_id)
         result = segmenter(input_location)
 
         draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
@@ -49,6 +52,10 @@ class ImageSemanticSegmentationTest(unittest.TestCase):
         cv2.imwrite('result.jpg', draw_img)
         print('test_image_semantic_segmentation_vitadapter_from_PIL DONE')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_style_transfer.py b/tests/pipelines/test_image_style_transfer.py
index 4e5bb69b..4b596cc9 100644
--- a/tests/pipelines/test_image_style_transfer.py
+++ b/tests/pipelines/test_image_style_transfer.py
@@ -7,12 +7,14 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageStyleTransferTest(unittest.TestCase):
+class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.image_style_transfer
         self.model_id = 'damo/cv_aams_style-transfer_damo'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -48,6 +50,10 @@ class ImageStyleTransferTest(unittest.TestCase):
         cv2.imwrite('result_styletransfer3.png', result[OutputKeys.OUTPUT_IMG])
         print('style_transfer.test_run_modelhub_default_model done')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_super_resolution.py b/tests/pipelines/test_image_super_resolution.py
index 8cf9e46f..cd3822c3 100644
--- a/tests/pipelines/test_image_super_resolution.py
+++ b/tests/pipelines/test_image_super_resolution.py
@@ -8,14 +8,16 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageSuperResolutionTest(unittest.TestCase):
+class ImageSuperResolutionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_rrdb_image-super-resolution'
         self.img = 'data/test/images/dogs.jpg'
+        self.task = Tasks.image_super_resolution
 
     def pipeline_inference(self, pipeline: Pipeline, img: str):
         result = pipeline(img)
@@ -35,6 +37,10 @@ class ImageSuperResolutionTest(unittest.TestCase):
         super_resolution = pipeline(Tasks.image_super_resolution)
         self.pipeline_inference(super_resolution, self.img)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py
index 32a853af..20636a42 100644
--- a/tests/pipelines/test_key_word_spotting.py
+++ b/tests/pipelines/test_key_word_spotting.py
@@ -10,6 +10,7 @@ import soundfile
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import ColorCodes, Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import download_and_untar, test_level
 
@@ -25,7 +26,7 @@ NEG_TESTSETS_FILE = 'neg_testsets.tar.gz'
 NEG_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/neg_testsets.tar.gz'
 
 
-class KeyWordSpottingTest(unittest.TestCase):
+class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck):
     action_info = {
         'test_run_with_wav': {
             'checking_item': [OutputKeys.KWS_LIST, 0, 'keyword'],
@@ -272,6 +273,10 @@ class KeyWordSpottingTest(unittest.TestCase):
             model_id=self.model_id, audio_in=audio_list)
         self.check_result('test_run_with_roc', kws_result)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_live_category.py b/tests/pipelines/test_live_category.py
index dead376d..835bc602 100644
--- a/tests/pipelines/test_live_category.py
+++ b/tests/pipelines/test_live_category.py
@@ -3,20 +3,28 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class LiveCategoryTest(unittest.TestCase):
+class LiveCategoryTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.live_category
+        self.model_id = 'damo/cv_resnet50_live-category'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        category_pipeline = pipeline(
-            Tasks.live_category, model='damo/cv_resnet50_live-category')
+        category_pipeline = pipeline(Tasks.live_category, self.model_id)
         result = category_pipeline(
             'data/test/videos/live_category_test_video.mp4')
 
         print(f'live category output: {result}.')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_movie_scene_segmentation.py b/tests/pipelines/test_movie_scene_segmentation.py
index 5993c634..e2fdc224 100644
--- a/tests/pipelines/test_movie_scene_segmentation.py
+++ b/tests/pipelines/test_movie_scene_segmentation.py
@@ -3,17 +3,21 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class MovieSceneSegmentationTest(unittest.TestCase):
+class MovieSceneSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.movie_scene_segmentation
+        self.model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_movie_scene_segmentation(self):
         input_location = 'data/test/videos/movie_scene_segmentation_test_video.mp4'
-        model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'
         movie_scene_segmentation_pipeline = pipeline(
-            Tasks.movie_scene_segmentation, model=model_id)
+            Tasks.movie_scene_segmentation, model=self.model_id)
         result = movie_scene_segmentation_pipeline(input_location)
         if result:
             print(result)
@@ -31,6 +35,10 @@ class MovieSceneSegmentationTest(unittest.TestCase):
         else:
             raise ValueError('process error')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py
index 642ac11d..55930b13 100644
--- a/tests/pipelines/test_mplug_tasks.py
+++ b/tests/pipelines/test_mplug_tasks.py
@@ -7,10 +7,15 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class MplugTasksTest(unittest.TestCase):
+class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = 'visual-question-answering'
+        self.model_id = 'damo/mplug_visual-question-answering_coco_large_en'
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_image_captioning_with_model(self):
@@ -75,6 +80,10 @@ class MplugTasksTest(unittest.TestCase):
         result = pipeline_retrieval(input)
         print(result)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py
index f94e31fa..3d296370 100644
--- a/tests/pipelines/test_multi_modal_embedding.py
+++ b/tests/pipelines/test_multi_modal_embedding.py
@@ -8,11 +8,16 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class MultiModalEmbeddingTest(unittest.TestCase):
-    model_id = 'damo/multi-modal_clip-vit-base-patch16_zh'
+class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.multi_modal_embedding
+        self.model_id = 'damo/multi-modal_clip-vit-base-patch16_zh'
+
     test_input = {'text': '皮卡丘'}
     model_version = 'dev'
 
@@ -54,6 +59,10 @@ class MultiModalEmbeddingTest(unittest.TestCase):
         print('l2-norm: {}'.format(torch.norm(text_embedding,
                                               dim=-1).item()))  # should be 1.0
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py
index ad0fa228..2c8d7b70 100644
--- a/tests/pipelines/test_named_entity_recognition.py
+++ b/tests/pipelines/test_named_entity_recognition.py
@@ -9,10 +9,16 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
 from modelscope.preprocessors import NERPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class NamedEntityRecognitionTest(unittest.TestCase):
+class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.named_entity_recognition
+        self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
+
     tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
     lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news'
     sentence = '这与温岭市新河镇的一个神秘的传说有关。'
@@ -88,6 +94,10 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.named_entity_recognition)
         print(pipeline_ins(input=self.sentence))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py
index 1d3fba12..80c69a01 100644
--- a/tests/pipelines/test_nli.py
+++ b/tests/pipelines/test_nli.py
@@ -8,12 +8,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import PairSentenceClassificationPipeline
 from modelscope.preprocessors import PairSentenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class NLITest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_nli_chinese-base'
+class NLITest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.nli
+        self.model_id = 'damo/nlp_structbert_nli_chinese-base'
+
     sentence1 = '四川商务职业学院和四川财经职业学院哪个好？'
     sentence2 = '四川商务职业学院商务管理在哪个校区？'
     regress_tool = MsRegressTool(baseline=False)
@@ -52,6 +57,10 @@ class NLITest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.nli)
         print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py
index de16aaa1..a754a517 100644
--- a/tests/pipelines/test_object_detection.py
+++ b/tests/pipelines/test_object_detection.py
@@ -3,10 +3,15 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ObjectDetectionTest(unittest.TestCase):
+class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.human_detection
+        self.model_id = 'damo/cv_resnet18_human-detection'
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_object_detection(self):
@@ -50,6 +55,10 @@ class ObjectDetectionTest(unittest.TestCase):
         else:
             raise ValueError('process error')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py
index a4201512..eeaa9d7a 100644
--- a/tests/pipelines/test_ocr_detection.py
+++ b/tests/pipelines/test_ocr_detection.py
@@ -4,14 +4,16 @@ import unittest
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class OCRDetectionTest(unittest.TestCase):
+class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo'
         self.test_image = 'data/test/images/ocr_detection.jpg'
+        self.task = Tasks.ocr_detection
 
     def pipeline_inference(self, pipeline: Pipeline, input_location: str):
         result = pipeline(input_location)
@@ -28,6 +30,10 @@ class OCRDetectionTest(unittest.TestCase):
         ocr_detection = pipeline(Tasks.ocr_detection)
         self.pipeline_inference(ocr_detection, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_ocr_recognition.py b/tests/pipelines/test_ocr_recognition.py
index a2e5ba8e..c4eb9e7a 100644
--- a/tests/pipelines/test_ocr_recognition.py
+++ b/tests/pipelines/test_ocr_recognition.py
@@ -1,26 +1,21 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path as osp
-import shutil
-import sys
-import tempfile
 import unittest
-from typing import Any, Dict, List, Tuple, Union
 
-import cv2
-import numpy as np
 import PIL
 
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class OCRRecognitionTest(unittest.TestCase):
+class OCRRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_convnextTiny_ocr-recognition-general_damo'
         self.test_image = 'data/test/images/ocr_recognition.jpg'
+        self.task = Tasks.ocr_recognition
 
     def pipeline_inference(self, pipeline: Pipeline, input_location: str):
         result = pipeline(input_location)
@@ -42,6 +37,10 @@ class OCRRecognitionTest(unittest.TestCase):
         ocr_recognition = pipeline(Tasks.ocr_recognition)
         self.pipeline_inference(ocr_recognition, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py
index 69bccac1..455b196b 100644
--- a/tests/pipelines/test_ofa_tasks.py
+++ b/tests/pipelines/test_ofa_tasks.py
@@ -11,10 +11,11 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import created_boxed_image
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class OfaTasksTest(unittest.TestCase):
+class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.output_dir = 'unittest_output'
@@ -251,6 +252,10 @@ class OfaTasksTest(unittest.TestCase):
         result[OutputKeys.OUTPUT_IMG].save('result.png')
         print(f'Output written to {osp.abspath("result.png")}')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py
index 90aaa500..ef30d702 100644
--- a/tests/pipelines/test_person_image_cartoon.py
+++ b/tests/pipelines/test_person_image_cartoon.py
@@ -8,13 +8,15 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageCartoonTest(unittest.TestCase):
+class ImageCartoonTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_unet_person-image-cartoon_compound-models'
+        self.task = Tasks.image_portrait_stylization
         self.test_image = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png'
 
     def pipeline_inference(self, pipeline: Pipeline, input_location: str):
@@ -34,6 +36,10 @@ class ImageCartoonTest(unittest.TestCase):
         img_cartoon = pipeline(Tasks.image_portrait_stylization)
         self.pipeline_inference(img_cartoon, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_product_retrieval_embedding.py b/tests/pipelines/test_product_retrieval_embedding.py
index c416943e..f2b0a33d 100644
--- a/tests/pipelines/test_product_retrieval_embedding.py
+++ b/tests/pipelines/test_product_retrieval_embedding.py
@@ -6,11 +6,16 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ProductRetrievalEmbeddingTest(unittest.TestCase):
-    model_id = 'damo/cv_resnet50_product-bag-embedding-models'
+class ProductRetrievalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.product_retrieval_embedding
+        self.model_id = 'damo/cv_resnet50_product-bag-embedding-models'
+
     img_input = 'data/test/images/product_embed_bag.jpg'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -34,6 +39,10 @@ class ProductRetrievalEmbeddingTest(unittest.TestCase):
         result = product_embed(self.img_input)[OutputKeys.IMG_EMBEDDING]
         print('abs sum value is: {}'.format(np.sum(np.abs(result))))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_realtime_object_detection.py b/tests/pipelines/test_realtime_object_detection.py
index 03ddacf4..25e8ffd4 100644
--- a/tests/pipelines/test_realtime_object_detection.py
+++ b/tests/pipelines/test_realtime_object_detection.py
@@ -2,22 +2,22 @@
 import unittest
 
 import cv2
-import numpy as np
 
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import realtime_object_detection_bbox_vis
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class RealtimeObjectDetectionTest(unittest.TestCase):
+class RealtimeObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_cspnet_image-object-detection_yolox'
         self.model_nano_id = 'damo/cv_cspnet_image-object-detection_yolox_nano_coco'
         self.test_image = 'data/test/images/keypoints_detect/000000438862.jpg'
+        self.task = Tasks.image_object_detection
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
@@ -47,6 +47,10 @@ class RealtimeObjectDetectionTest(unittest.TestCase):
         else:
             raise ValueError('process error')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py
index 20502a19..d9e260f2 100644
--- a/tests/pipelines/test_relation_extraction.py
+++ b/tests/pipelines/test_relation_extraction.py
@@ -1,8 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import unittest
 
-import torch
-
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import InformationExtractionModel
@@ -10,11 +8,16 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import InformationExtractionPipeline
 from modelscope.preprocessors import RelationExtractionPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class RelationExtractionTest(unittest.TestCase):
-    model_id = 'damo/nlp_bert_relation-extraction_chinese-base'
+class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.information_extraction
+        self.model_id = 'damo/nlp_bert_relation-extraction_chinese-base'
+
     sentence = '高捷，祖籍江苏，本科毕业于东南大学'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -52,6 +55,10 @@ class RelationExtractionTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.information_extraction)
         print(pipeline_ins(input=self.sentence))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_salient_detection.py b/tests/pipelines/test_salient_detection.py
index ec010b17..52e84be7 100644
--- a/tests/pipelines/test_salient_detection.py
+++ b/tests/pipelines/test_salient_detection.py
@@ -4,10 +4,15 @@ import unittest
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SalientDetectionTest(unittest.TestCase):
+class SalientDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id = 'damo/cv_u2net_salient-detection'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_salient_detection(self):
@@ -19,6 +24,10 @@ class SalientDetectionTest(unittest.TestCase):
         # result[OutputKeys.MASKS] is salient map result,other keys are not used
         cv2.imwrite(input_location + '_salient.jpg', result[OutputKeys.MASKS])
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py
index 6990bf75..d9da1e65 100644
--- a/tests/pipelines/test_sentence_similarity.py
+++ b/tests/pipelines/test_sentence_similarity.py
@@ -8,12 +8,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import PairSentenceClassificationPipeline
 from modelscope.preprocessors import PairSentenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class SentenceSimilarityTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.sentence_similarity
+        self.model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+
     sentence1 = '今天气温比昨天高么？'
     sentence2 = '今天湿度比昨天高么？'
     regress_tool = MsRegressTool(baseline=False)
@@ -58,6 +63,10 @@ class SentenceSimilarityTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.sentence_similarity)
         print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py
index 35c96282..939b7360 100644
--- a/tests/pipelines/test_sentiment_classification.py
+++ b/tests/pipelines/test_sentiment_classification.py
@@ -9,11 +9,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline
 from modelscope.preprocessors import SingleSentenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SentimentClassificationTaskModelTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
+class SentimentClassificationTaskModelTest(unittest.TestCase,
+                                           DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.sentiment_classification
+        self.model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
+
     sentence1 = '启动的时候很大声音，然后就会听到1.2秒的卡察的声音，类似齿轮摩擦的声音'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -60,6 +66,10 @@ class SentimentClassificationTaskModelTest(unittest.TestCase):
         self.assertTrue(
             isinstance(pipeline_ins.model, SequenceClassificationModel))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_skin_retouching.py b/tests/pipelines/test_skin_retouching.py
index c6dbee2c..9e73334c 100644
--- a/tests/pipelines/test_skin_retouching.py
+++ b/tests/pipelines/test_skin_retouching.py
@@ -9,12 +9,14 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SkinRetouchingTest(unittest.TestCase):
+class SkinRetouchingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.skin_retouching
         self.model_id = 'damo/cv_unet_skin-retouching'
         self.test_image = 'data/test/images/skin_retouching.png'
 
@@ -39,6 +41,10 @@ class SkinRetouchingTest(unittest.TestCase):
         skin_retouching = pipeline(Tasks.skin_retouching)
         self.pipeline_inference(skin_retouching, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py
index 007e6c73..8ca6bf1d 100644
--- a/tests/pipelines/test_speech_signal_process.py
+++ b/tests/pipelines/test_speech_signal_process.py
@@ -1,11 +1,10 @@
 import os.path
-import shutil
 import unittest
 
-from modelscope.fileio import File
 from modelscope.metainfo import Pipelines
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 NEAREND_MIC_FILE = 'data/test/audios/nearend_mic.wav'
@@ -14,7 +13,7 @@ FAREND_SPEECH_FILE = 'data/test/audios/farend_speech.wav'
 NOISE_SPEECH_FILE = 'data/test/audios/speech_with_noise.wav'
 
 
-class SpeechSignalProcessTest(unittest.TestCase):
+class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         pass
@@ -85,6 +84,10 @@ class SpeechSignalProcessTest(unittest.TestCase):
             ans(data, output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index 542568d1..3a2870ea 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -6,14 +6,16 @@ from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import SequenceClassificationPipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
-from modelscope.utils.constant import Hubs, Tasks
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SequenceClassificationTest(unittest.TestCase):
+class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/bert-base-sst2'
+        self.task = Tasks.text_classification
 
     def predict(self, pipeline_ins: SequenceClassificationPipeline):
         from easynlp.appzoo import load_dataset
@@ -87,6 +89,10 @@ class SequenceClassificationTest(unittest.TestCase):
         result = text_classification(dataset)
         self.printDataset(result)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_driven_segmentation.py b/tests/pipelines/test_text_driven_segmentation.py
index 741787d9..a693edac 100644
--- a/tests/pipelines/test_text_driven_segmentation.py
+++ b/tests/pipelines/test_text_driven_segmentation.py
@@ -23,6 +23,10 @@ class TextDrivenSegmentationTest(unittest.TestCase):
         # result[OutputKeys.MASKS] is segment map result,other keys are not used
         cv2.imwrite(input_location + '_lseg.jpg', result[OutputKeys.MASKS])
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.test_demo()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_error_correction.py b/tests/pipelines/test_text_error_correction.py
index 5a1890ce..3400fbb7 100644
--- a/tests/pipelines/test_text_error_correction.py
+++ b/tests/pipelines/test_text_error_correction.py
@@ -8,11 +8,16 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextErrorCorrectionPipeline
 from modelscope.preprocessors import TextErrorCorrectionPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TextErrorCorrectionTest(unittest.TestCase):
-    model_id = 'damo/nlp_bart_text-error-correction_chinese'
+class TextErrorCorrectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.text_error_correction
+        self.model_id = 'damo/nlp_bart_text-error-correction_chinese'
+
     input = '随着中国经济突飞猛近，建造工业与日俱增'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -50,6 +55,10 @@ class TextErrorCorrectionTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.text_error_correction)
         print(pipeline_ins(self.input))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
index c08209a4..2a4d470d 100644
--- a/tests/pipelines/test_text_generation.py
+++ b/tests/pipelines/test_text_generation.py
@@ -8,10 +8,11 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextGenerationPipeline
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TextGenerationTest(unittest.TestCase):
+class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.palm_model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base'
@@ -128,6 +129,10 @@ class TextGenerationTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.text_generation)
         print(pipeline_ins(self.palm_input_zh))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_to_image_synthesis.py b/tests/pipelines/test_text_to_image_synthesis.py
index 32778ffb..5a5ed357 100644
--- a/tests/pipelines/test_text_to_image_synthesis.py
+++ b/tests/pipelines/test_text_to_image_synthesis.py
@@ -8,11 +8,16 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TextToImageSynthesisTest(unittest.TestCase):
-    model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny'
+class TextToImageSynthesisTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.text_to_image_synthesis
+        self.model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny'
+
     test_text = {
         'text': '宇航员',
         'generator_ddim_timesteps': 2,
@@ -46,6 +51,10 @@ class TextToImageSynthesisTest(unittest.TestCase):
             self.test_text)[OutputKeys.OUTPUT_IMG]
         print(np.sum(np.abs(img)))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py
index 74cab01f..0a075352 100644
--- a/tests/pipelines/test_text_to_speech.py
+++ b/tests/pipelines/test_text_to_speech.py
@@ -10,6 +10,7 @@ from scipy.io.wavfile import write
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import test_level
 
@@ -18,22 +19,29 @@ import tensorflow as tf  # isort:skip
 logger = get_logger()
 
 
-class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase):
+class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase,
+                                                DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.text_to_speech
+        self.model_id = 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_pipeline(self):
         text = '今天北京天气怎么样？'
-        model_id = 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k'
         voice = 'zhitian_emo'
 
-        sambert_hifigan_tts = pipeline(
-            task=Tasks.text_to_speech, model=model_id)
+        sambert_hifigan_tts = pipeline(task=self.task, model=self.model_id)
         self.assertTrue(sambert_hifigan_tts is not None)
         output = sambert_hifigan_tts(input=text, voice=voice)
         self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM])
         pcm = output[OutputKeys.OUTPUT_PCM]
         write('output.wav', 16000, pcm)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_tinynas_classification.py b/tests/pipelines/test_tinynas_classification.py
index d64b5bc0..da5ca933 100644
--- a/tests/pipelines/test_tinynas_classification.py
+++ b/tests/pipelines/test_tinynas_classification.py
@@ -2,10 +2,15 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TinyNASClassificationTest(unittest.TestCase):
+class TinyNASClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_classification
+        self.model_id = 'damo/cv_tinynas_classification'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
@@ -14,6 +19,10 @@ class TinyNASClassificationTest(unittest.TestCase):
         result = tinynas_classification('data/test/images/image_wolf.jpeg')
         print(result)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py
index 6b2ecd0b..e9eaeb59 100644
--- a/tests/pipelines/test_tinynas_detection.py
+++ b/tests/pipelines/test_tinynas_detection.py
@@ -15,6 +15,10 @@ class TinynasObjectDetectionTest(unittest.TestCase):
             'data/test/images/image_detection.jpg')
         print(result)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.test_demo()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_video_category.py b/tests/pipelines/test_video_category.py
index aba56676..98890bef 100644
--- a/tests/pipelines/test_video_category.py
+++ b/tests/pipelines/test_video_category.py
@@ -3,20 +3,28 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class VideoCategoryTest(unittest.TestCase):
+class VideoCategoryTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_category
+        self.model_id = 'damo/cv_resnet50_video-category'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        category_pipeline = pipeline(
-            Tasks.video_category, model='damo/cv_resnet50_video-category')
+        category_pipeline = pipeline(Tasks.video_category, self.model_id)
         result = category_pipeline(
             'data/test/videos/video_category_test_video.mp4')
 
         print(f'video category output: {result}.')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_video_multi_modal_embedding.py b/tests/pipelines/test_video_multi_modal_embedding.py
index b33ba56c..9e26c967 100644
--- a/tests/pipelines/test_video_multi_modal_embedding.py
+++ b/tests/pipelines/test_video_multi_modal_embedding.py
@@ -4,15 +4,19 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import test_level
 
 logger = get_logger()
 
 
-class VideoMultiModalEmbeddingTest(unittest.TestCase):
+class VideoMultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_multi_modal_embedding
+        self.model_id = 'damo/multi_modal_clip_vtretrival_msrvtt_53'
 
-    model_id = 'damo/multi_modal_clip_vtretrival_msrvtt_53'
     video_path = 'data/test/videos/multi_modal_test_video_9770.mp4'
     caption = ('a person is connecting something to system', None, None)
     _input = {'video': video_path, 'text': caption}
@@ -37,6 +41,10 @@ class VideoMultiModalEmbeddingTest(unittest.TestCase):
         logger.info('video feature: {}'.format(
             output['video_embedding'][0][0][0]))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_video_single_object_tracking.py b/tests/pipelines/test_video_single_object_tracking.py
index fc228cd8..51d39c20 100644
--- a/tests/pipelines/test_video_single_object_tracking.py
+++ b/tests/pipelines/test_video_single_object_tracking.py
@@ -5,12 +5,14 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import show_video_tracking_result
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SingleObjectTracking(unittest.TestCase):
+class SingleObjectTracking(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.video_single_object_tracking
         self.model_id = 'damo/cv_vitb_video-single-object-tracking_ostrack'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -33,6 +35,10 @@ class SingleObjectTracking(unittest.TestCase):
         result = video_single_object_tracking((video_path, init_bbox))
         print('result is : ', result[OutputKeys.BOXES])
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_video_summarization.py b/tests/pipelines/test_video_summarization.py
index 12a0ee07..67c0cbd1 100644
--- a/tests/pipelines/test_video_summarization.py
+++ b/tests/pipelines/test_video_summarization.py
@@ -4,17 +4,21 @@ import unittest
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import show_video_summarization_result
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class VideoSummarizationTest(unittest.TestCase):
+class VideoSummarizationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_summarization
+        self.model_id = 'damo/cv_googlenet_pgl-video-summarization'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        model_id = 'damo/cv_googlenet_pgl-video-summarization'
         video_path = 'data/test/videos/video_category_test_video.mp4'
         summarization_pipeline = pipeline(
-            Tasks.video_summarization, model=model_id)
+            Tasks.video_summarization, model=self.model_id)
         result = summarization_pipeline(video_path)
 
         print(f'video summarization output: \n{result}.')
@@ -29,6 +33,10 @@ class VideoSummarizationTest(unittest.TestCase):
 
         print(f'video summarization output:\n {result}.')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_virtual_try_on.py b/tests/pipelines/test_virtual_try_on.py
index 1979c9b8..07132c8a 100644
--- a/tests/pipelines/test_virtual_try_on.py
+++ b/tests/pipelines/test_virtual_try_on.py
@@ -6,11 +6,16 @@ from PIL import Image
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class VirtualTryonTest(unittest.TestCase):
-    model_id = 'damo/cv_daflow_virtual-try-on_base'
+class VirtualTryonTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.virtual_try_on
+        self.model_id = 'damo/cv_daflow_virtual-try-on_base'
+
     masked_model = Image.open('data/test/images/virtual_tryon_model.jpg')
     pose = Image.open('data/test/images/virtual_tryon_pose.jpg')
     cloth = Image.open('data/test/images/virtual_tryon_cloth.jpg')
@@ -29,6 +34,10 @@ class VirtualTryonTest(unittest.TestCase):
         img = pipeline_virtual_tryon(self.input_imgs)[OutputKeys.OUTPUT_IMG]
         cv2.imwrite('demo.jpg', img[:, :, ::-1])
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
index 87006f96..835f59e7 100644
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import shutil
 import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
@@ -9,12 +8,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import WordSegmentationPipeline
 from modelscope.preprocessors import TokenClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class WordSegmentationTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
+class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.word_segmentation
+        self.model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
+
     sentence = '今天天气不错，适合出去游玩'
     sentence_eng = 'I am a program.'
     regress_tool = MsRegressTool(baseline=False)
@@ -55,6 +59,10 @@ class WordSegmentationTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.word_segmentation)
         print(pipeline_ins(input=self.sentence))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
index f0f2a481..cdf6f31e 100644
--- a/tests/pipelines/test_zero_shot_classification.py
+++ b/tests/pipelines/test_zero_shot_classification.py
@@ -8,12 +8,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import ZeroShotClassificationPipeline
 from modelscope.preprocessors import ZeroShotClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class ZeroShotClassificationTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base'
+class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.zero_shot_classification
+        self.model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base'
+
     sentence = '全新突破 解放军运20版空中加油机曝光'
     labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事']
     template = '这篇文章的标题是{}'
@@ -65,6 +70,10 @@ class ZeroShotClassificationTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.zero_shot_classification)
         print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()

From be2f31fc158e316831dd922d9298278e9a686c2c Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Thu, 8 Sep 2022 16:07:34 +0800
Subject: [PATCH 082/175] [to #42322933] Fix mplug model interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 修复 mplug 模型接口问题
2. 修复 mplug inference 不支持 batch 输入问题
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10052321
---
 .../multi_modal/mplug/modeling_mplug.py       |  2 ++
 .../models/multi_modal/mplug_for_all_tasks.py | 32 +++++++++++++------
 .../multi_modal/image_captioning_pipeline.py  |  4 +--
 .../image_text_retrieval_pipeline.py          |  2 +-
 .../visual_question_answering_pipeline.py     |  4 +--
 5 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py
index f469c218..ec491f1d 100755
--- a/modelscope/models/multi_modal/mplug/modeling_mplug.py
+++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py
@@ -1868,6 +1868,8 @@ class MPlug(PreTrainedModel):
             checkpoint = torch.load(checkpoint_path, map_location='cpu')
             if 'model' in checkpoint:
                 checkpoint = checkpoint['model']
+            if 'module' in checkpoint:
+                checkpoint = checkpoint['module']
             checkpoint = {
                 k.replace('model.', ''): v
                 for k, v in checkpoint.items()
diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py
index 608cc733..a06e5800 100644
--- a/modelscope/models/multi_modal/mplug_for_all_tasks.py
+++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py
@@ -1,10 +1,13 @@
+import os.path as osp
 from typing import Dict, List
 
 from modelscope.metainfo import Models
 from modelscope.models import TorchModel
 from modelscope.models.base import Tensor
 from modelscope.models.builder import MODELS
-from modelscope.utils.constant import Tasks
+from modelscope.outputs import OutputKeys
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['MPlugForAllTasks']
 
@@ -44,17 +47,28 @@ class MPlugForAllTasks(TorchModel):
                                ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
                                ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
 
+        # get task from config file
+        task = Config.from_file(
+            osp.join(self.model_dir, ModelFile.CONFIGURATION)).task
+
         # inference
         if not self.training and 'question' in input:
             output = self.model(input['image'], input['question'], train=False)
-            if not isinstance(output, tuple):
-                return output
+            if task == Tasks.image_text_retrieval:
+                return {OutputKeys.SCORES: output[0].tolist()}
             topk_ids, _ = output
-            pred_string: str = self.tokenizer.decode(topk_ids[0][0])
-            for _old, _new in replace_tokens_bert:
-                pred_string = pred_string.replace(_old, _new)
-            pred_string = pred_string.strip()
-            return pred_string
+            topk_ids = [topk_ids[i][0] for i in range(len(topk_ids))]
+            pred_strings: List[str] = \
+                self.tokenizer.batch_decode(topk_ids, skip_special_tokens=True)
+            output = []
+            for pred_string in pred_strings:
+                for _old, _new in replace_tokens_bert:
+                    pred_string = pred_string.replace(_old, _new)
+                pred_string = pred_string.strip()
+                output.append(pred_string)
+            output_key = OutputKeys.CAPTION \
+                if task == Tasks.image_captioning else OutputKeys.TEXT
+            return {output_key: output}
 
         # train and evaluate
         import addict
@@ -71,7 +85,7 @@ class MPlugForAllTasks(TorchModel):
             index = input['index']
             output = self.model(image, answer, index, train=self.training)
         if self.training:
-            return {'loss': output}
+            return {OutputKeys.LOSS: output}
 
         # evaluate
         topk_ids, _ = output
diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
index 99cccee1..81a5f8cd 100644
--- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
+++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
@@ -52,6 +52,4 @@ class ImageCaptioningPipeline(Pipeline):
             return super().forward(inputs, **forward_params)
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        if isinstance(self.model, OfaForAllTasks):
-            return inputs
-        return {OutputKeys.CAPTION: inputs}
+        return inputs
diff --git a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
index 1ebcf526..329d79bf 100644
--- a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
+++ b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
@@ -48,4 +48,4 @@ class ImageTextRetrievalPipeline(Pipeline):
             return super().forward(inputs, **forward_params)
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        return {OutputKeys.SCORES: inputs[0].tolist()}
+        return inputs
diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
index b2442a3e..86177074 100644
--- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
@@ -56,6 +56,4 @@ class VisualQuestionAnsweringPipeline(Pipeline):
         Returns:
             Dict[str, str]: the prediction results
         """
-        if isinstance(self.model, OfaForAllTasks):
-            return inputs
-        return {OutputKeys.TEXT: inputs}
+        return inputs

From 652ec697b79c9c62c6afaed69ff2ae4f99d62b66 Mon Sep 17 00:00:00 2001
From: "jiangnana.jnn" <jiangnana.jnn@alibaba-inc.com>
Date: Thu, 8 Sep 2022 20:16:14 +0800
Subject: [PATCH 083/175] refactor inputs format of model forward         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9673243

    * refactor inputs format of model forward
---
 modelscope/models/base/base_head.py        | 21 ++++++++-------------
 modelscope/models/base/base_model.py       | 22 ++++++++--------------
 modelscope/models/base/base_torch_head.py  |  8 +++-----
 modelscope/models/base/base_torch_model.py | 13 ++++++-------
 4 files changed, 25 insertions(+), 39 deletions(-)

diff --git a/modelscope/models/base/base_head.py b/modelscope/models/base/base_head.py
index 07a68253..11bda32f 100644
--- a/modelscope/models/base/base_head.py
+++ b/modelscope/models/base/base_head.py
@@ -1,6 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from abc import ABC, abstractmethod
-from typing import Dict, Union
+from typing import Any, Dict, Union
 
 from modelscope.models.base.base_model import Model
 from modelscope.utils.config import ConfigDict
@@ -22,25 +22,20 @@ class Head(ABC):
         self.config = ConfigDict(kwargs)
 
     @abstractmethod
-    def forward(self, input: Input) -> Dict[str, Tensor]:
+    def forward(self, *args, **kwargs) -> Dict[str, Any]:
         """
         This method will use the output from backbone model to do any
-        downstream tasks
-        Args:
-            input: The tensor output or a model from backbone model
-            (text generation need a model as input)
-        Returns: The output from downstream taks
+        downstream tasks. Receive the output from the backbone model.
+
+        Returns (Dict[str, Any]): The output from downstream task.
         """
         pass
 
     @abstractmethod
-    def compute_loss(self, outputs: Dict[str, Tensor],
-                     labels) -> Dict[str, Tensor]:
+    def compute_loss(self, *args, **kwargs) -> Dict[str, Any]:
         """
-        compute loss for head during the finetuning
+        compute loss for head during the finetuning.
 
-        Args:
-            outputs (Dict[str, Tensor]):  the output from the model forward
-        Returns:  the loss(Dict[str, Tensor]):
+        Returns (Dict[str, Any]): The loss dict
         """
         pass
diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py
index 872c42e8..8744ce1c 100644
--- a/modelscope/models/base/base_model.py
+++ b/modelscope/models/base/base_model.py
@@ -2,7 +2,7 @@
 import os
 import os.path as osp
 from abc import ABC, abstractmethod
-from typing import Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.builder import build_model
@@ -10,8 +10,6 @@ from modelscope.utils.checkpoint import save_pretrained
 from modelscope.utils.config import Config
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
 from modelscope.utils.device import device_placement, verify_device
-from modelscope.utils.file_utils import func_receive_dict_inputs
-from modelscope.utils.hub import parse_label_mapping
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -27,35 +25,31 @@ class Model(ABC):
         verify_device(device_name)
         self._device_name = device_name
 
-    def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        return self.postprocess(self.forward(input))
+    def __call__(self, *args, **kwargs) -> Dict[str, Any]:
+        return self.postprocess(self.forward(*args, **kwargs))
 
     @abstractmethod
-    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    def forward(self, *args, **kwargs) -> Dict[str, Any]:
         """
         Run the forward pass for a model.
 
-        Args:
-            input (Dict[str, Tensor]): the dict of the model inputs for the forward method
-
         Returns:
-            Dict[str, Tensor]: output from the model forward pass
+            Dict[str, Any]: output from the model forward pass
         """
         pass
 
-    def postprocess(self, input: Dict[str, Tensor],
-                    **kwargs) -> Dict[str, Tensor]:
+    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
         """ Model specific postprocess and convert model output to
         standard model outputs.
 
         Args:
-            input:  input data
+            inputs:  input data
 
         Return:
             dict of results:  a dict containing outputs of model, each
                 output should have the standard output name.
         """
-        return input
+        return inputs
 
     @classmethod
     def _instantiate(cls, **kwargs):
diff --git a/modelscope/models/base/base_torch_head.py b/modelscope/models/base/base_torch_head.py
index c5a78519..faee4296 100644
--- a/modelscope/models/base/base_torch_head.py
+++ b/modelscope/models/base/base_torch_head.py
@@ -1,5 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from typing import Dict
+from typing import Any, Dict
 
 import torch
 
@@ -18,10 +18,8 @@ class TorchHead(Head, torch.nn.Module):
         super().__init__(**kwargs)
         torch.nn.Module.__init__(self)
 
-    def forward(self, inputs: Dict[str,
-                                   torch.Tensor]) -> Dict[str, torch.Tensor]:
+    def forward(self, *args, **kwargs) -> Dict[str, Any]:
         raise NotImplementedError
 
-    def compute_loss(self, outputs: Dict[str, torch.Tensor],
-                     labels) -> Dict[str, torch.Tensor]:
+    def compute_loss(self, *args, **kwargs) -> Dict[str, Any]:
         raise NotImplementedError
diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py
index cfc88721..3c99a1f2 100644
--- a/modelscope/models/base/base_torch_model.py
+++ b/modelscope/models/base/base_torch_model.py
@@ -1,6 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict
 
 import torch
 from torch import nn
@@ -21,15 +21,14 @@ class TorchModel(Model, torch.nn.Module):
         super().__init__(model_dir, *args, **kwargs)
         torch.nn.Module.__init__(self)
 
-    def __call__(self, input: Dict[str,
-                                   torch.Tensor]) -> Dict[str, torch.Tensor]:
+    def __call__(self, *args, **kwargs) -> Dict[str, Any]:
+        # Adapt to models whose forward takes a single dict argument named `input` or `inputs`
         if func_receive_dict_inputs(self.forward):
-            return self.postprocess(self.forward(input))
+            return self.postprocess(self.forward(args[0], **kwargs))
         else:
-            return self.postprocess(self.forward(**input))
+            return self.postprocess(self.forward(*args, **kwargs))
 
-    def forward(self, inputs: Dict[str,
-                                   torch.Tensor]) -> Dict[str, torch.Tensor]:
+    def forward(self, *args, **kwargs) -> Dict[str, Any]:
         raise NotImplementedError
 
     def post_init(self):

From 5e176da3a1b83a77d628ce7e673fb31bbb7ca115 Mon Sep 17 00:00:00 2001
From: "jiangnana.jnn" <jiangnana.jnn@alibaba-inc.com>
Date: Fri, 9 Sep 2022 10:01:51 +0800
Subject: [PATCH 084/175] adapt to msdataset for EasyCV         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9935664

    * adapt to msdataset for EasyCV
---
 modelscope/msdatasets/cv/easycv_base.py       | 59 +++++++++++++
 .../segmentation_dataset.py                   | 46 +++++++++-
 .../cv/object_detection/detection_dataset.py  | 64 +++++++++++++-
 modelscope/trainers/easycv/trainer.py         |  8 --
 .../trainers/easycv/utils/register_util.py    | 54 ++++++++++--
 modelscope/trainers/trainer.py                | 42 ++++++---
 modelscope/utils/test_utils.py                | 24 ++++-
 tests/trainers/easycv/test_easycv_trainer.py  | 87 +++++++++----------
 tests/trainers/easycv/test_segformer.py       | 56 +++---------
 tests/trainers/test_trainer.py                | 10 +--
 tests/trainers/test_trainer_gpu.py            |  5 +-
 11 files changed, 322 insertions(+), 133 deletions(-)
 create mode 100644 modelscope/msdatasets/cv/easycv_base.py

diff --git a/modelscope/msdatasets/cv/easycv_base.py b/modelscope/msdatasets/cv/easycv_base.py
new file mode 100644
index 00000000..92b77389
--- /dev/null
+++ b/modelscope/msdatasets/cv/easycv_base.py
@@ -0,0 +1,59 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+
+
+class EasyCVBaseDataset(object):
+    """Adapt to MSDataset.
+    Subclasses need to implement ``DATA_STRUCTURE``, the format is as follows, e.g.:
+
+    {
+        '${data source name}': {
+            'train':{
+                '${image root arg}': 'images',  # directory name of images relative to the root path
+                '${label root arg}': 'labels',  # directory name of labels relative to the root path
+                ...
+            },
+            'validation': {
+                '${image root arg}': 'images',
+                '${label root arg}': 'labels',
+                ...
+            }
+        }
+    }
+
+    Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"validation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits
+            the model if supplied. Not supported yet.
+        mode: Training or Evaluation.
+    """
+    DATA_STRUCTURE = None
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 args=(),
+                 kwargs={}) -> None:
+        self.split_config = split_config
+        self.preprocessor = preprocessor
+        self.mode = mode
+        if self.split_config is not None:
+            self._update_data_source(kwargs['data_source'])
+
+    def _update_data_source(self, data_source):
+        data_root = next(iter(self.split_config.values()))
+        split = next(iter(self.split_config.keys()))
+
+        # TODO: msdataset should support these keys to be configured in the dataset's json file and passed in
+        if data_source['type'] not in list(self.DATA_STRUCTURE.keys()):
+            raise ValueError(
+                'Only support %s now, but get %s.' %
+                (list(self.DATA_STRUCTURE.keys()), data_source['type']))
+
+        # join data root path of msdataset and default relative name
+        update_args = self.DATA_STRUCTURE[data_source['type']][split]
+        for k, v in update_args.items():
+            data_source.update({k: osp.join(data_root, v)})
diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
index 21114c11..c53e1431 100644
--- a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
+++ b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
@@ -1,21 +1,65 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+
 from easycv.datasets.segmentation import SegDataset as _SegDataset
 
 from modelscope.metainfo import Datasets
+from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset
 from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
 from modelscope.utils.constant import Tasks
 
 
+class EasyCVSegBaseDataset(EasyCVBaseDataset):
+    DATA_STRUCTURE = {
+        # data source name
+        'SegSourceRaw': {
+            'train': {
+                'img_root':
+                'images',  # directory name of images relative to the root path
+                'label_root':
+                'annotations',  # directory name of annotation relative to the root path
+                'split':
+                'train.txt'  # split file name relative to the root path
+            },
+            'validation': {
+                'img_root': 'images',
+                'label_root': 'annotations',
+                'split': 'val.txt'
+            }
+        }
+    }
+
+
 @TASK_DATASETS.register_module(
     group_key=Tasks.image_segmentation, module_name=Datasets.SegDataset)
-class SegDataset(_SegDataset):
+class SegDataset(EasyCVSegBaseDataset, _SegDataset):
     """EasyCV dataset for Sementic segmentation.
     For more details, please refer to :
     https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/segmentation/raw.py .
 
     Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"validation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits
+            the model if supplied. Not supported yet.
+        mode: Training or Evaluation.
         data_source: Data source config to parse input data.
         pipeline: Sequence of transform object or config dict to be composed.
         ignore_index (int): Label index to be ignored.
         profiling: If set True, will print transform time.
     """
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVSegBaseDataset.__init__(
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)
+        _SegDataset.__init__(self, *args, **kwargs)
diff --git a/modelscope/msdatasets/cv/object_detection/detection_dataset.py b/modelscope/msdatasets/cv/object_detection/detection_dataset.py
index 5b130a3e..e3aaaa92 100644
--- a/modelscope/msdatasets/cv/object_detection/detection_dataset.py
+++ b/modelscope/msdatasets/cv/object_detection/detection_dataset.py
@@ -1,31 +1,71 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+
 from easycv.datasets.detection import DetDataset as _DetDataset
 from easycv.datasets.detection import \
     DetImagesMixDataset as _DetImagesMixDataset
 
 from modelscope.metainfo import Datasets
+from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset
 from modelscope.msdatasets.task_datasets import TASK_DATASETS
 from modelscope.utils.constant import Tasks
 
 
+class EasyCVDetBaseDataset(EasyCVBaseDataset):
+    DATA_STRUCTURE = {
+        'DetSourceCoco': {
+            'train': {
+                'ann_file':
+                'train.json',  # file name of annotation relative to the root path
+                'img_prefix':
+                'images',  # directory name of images relative to the root path
+            },
+            'validation': {
+                'ann_file': 'val.json',
+                'img_prefix': 'images',
+            }
+        }
+    }
+
+
 @TASK_DATASETS.register_module(
     group_key=Tasks.image_object_detection, module_name=Datasets.DetDataset)
-class DetDataset(_DetDataset):
+class DetDataset(EasyCVDetBaseDataset, _DetDataset):
     """EasyCV dataset for object detection.
     For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/raw.py .
 
     Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"validation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits
+            the model if supplied. Not supported yet.
+        mode: Training or Evaluation.
         data_source: Data source config to parse input data.
         pipeline: Transform config list
         profiling: If set True, will print pipeline time
         classes: A list of class names, used in evaluation for result and groundtruth visualization
     """
 
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVDetBaseDataset.__init__(
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)
+        _DetDataset.__init__(self, *args, **kwargs)
+
 
 @TASK_DATASETS.register_module(
     group_key=Tasks.image_object_detection,
     module_name=Datasets.DetImagesMixDataset)
-class DetImagesMixDataset(_DetImagesMixDataset):
+class DetImagesMixDataset(EasyCVDetBaseDataset, _DetImagesMixDataset):
     """EasyCV dataset for object detection, a wrapper of multiple images mixed dataset.
     Suitable for training on multiple images mixed data augmentation like
     mosaic and mixup. For the augmentation pipeline of mixed image data,
@@ -38,6 +78,11 @@ class DetImagesMixDataset(_DetImagesMixDataset):
     For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/mix.py .
 
     Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"validation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits
+            the model if supplied. Not supported yet.
+        mode: Training or Evaluation.
         data_source (:obj:`DetSourceCoco`): Data source config to parse input data.
         pipeline (Sequence[dict]): Sequence of transform object or
             config dict to be composed.
@@ -47,3 +92,18 @@ class DetImagesMixDataset(_DetImagesMixDataset):
             be skip pipeline. Default to None.
         label_padding: out labeling padding [N, 120, 5]
     """
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVDetBaseDataset.__init__(
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)
+        _DetImagesMixDataset.__init__(self, *args, **kwargs)
diff --git a/modelscope/trainers/easycv/trainer.py b/modelscope/trainers/easycv/trainer.py
index dee06a41..3c869495 100644
--- a/modelscope/trainers/easycv/trainer.py
+++ b/modelscope/trainers/easycv/trainer.py
@@ -27,7 +27,6 @@ class EasyCVEpochBasedTrainer(EpochBasedTrainer):
     """Epoch based Trainer for EasyCV.
 
     Args:
-        task: Task name.
         cfg_file(str): The config file of EasyCV.
         model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir
             or a model id. If model is None, build_model method will be called.
@@ -51,7 +50,6 @@ class EasyCVEpochBasedTrainer(EpochBasedTrainer):
 
     def __init__(
             self,
-            task: str,
             cfg_file: Optional[str] = None,
             model: Optional[Union[TorchModel, nn.Module, str]] = None,
             arg_parse_fn: Optional[Callable] = None,
@@ -64,7 +62,6 @@ class EasyCVEpochBasedTrainer(EpochBasedTrainer):
             model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
             **kwargs):
 
-        self.task = task
         register_util.register_parallel()
         register_util.register_part_mmcv_hooks_to_ms()
 
@@ -168,8 +165,3 @@ class EasyCVEpochBasedTrainer(EpochBasedTrainer):
             device_ids=[torch.cuda.current_device()])
 
         return build_parallel(dp_cfg)
-
-    def rebuild_config(self, cfg: Config):
-        cfg.task = self.task
-
-        return cfg
diff --git a/modelscope/trainers/easycv/utils/register_util.py b/modelscope/trainers/easycv/utils/register_util.py
index f80eaace..04bf719b 100644
--- a/modelscope/trainers/easycv/utils/register_util.py
+++ b/modelscope/trainers/easycv/utils/register_util.py
@@ -4,16 +4,49 @@ import logging
 
 from modelscope.trainers.hooks import HOOKS
 from modelscope.trainers.parallel.builder import PARALLEL
+from modelscope.utils.registry import default_group
+
+
+class _RegisterManager:
+
+    def __init__(self):
+        self.registries = {}
+
+    def add(self, module, name, group_key=default_group):
+        if module.name not in self.registries:
+            self.registries[module.name] = {}
+        if group_key not in self.registries[module.name]:
+            self.registries[module.name][group_key] = []
+
+        self.registries[module.name][group_key].append(name)
+
+    def exists(self, module, name, group_key=default_group):
+        if self.registries.get(module.name, None) is None:
+            return False
+        if self.registries[module.name].get(group_key, None) is None:
+            return False
+        if name in self.registries[module.name][group_key]:
+            return True
+
+        return False
+
+
+_dynamic_register = _RegisterManager()
 
 
 def register_parallel():
     from mmcv.parallel import MMDistributedDataParallel, MMDataParallel
 
-    PARALLEL.register_module(
-        module_name='MMDistributedDataParallel',
-        module_cls=MMDistributedDataParallel)
-    PARALLEL.register_module(
-        module_name='MMDataParallel', module_cls=MMDataParallel)
+    mmddp = 'MMDistributedDataParallel'
+    mmdp = 'MMDataParallel'
+
+    if not _dynamic_register.exists(PARALLEL, mmddp):
+        _dynamic_register.add(PARALLEL, mmddp)
+        PARALLEL.register_module(
+            module_name=mmddp, module_cls=MMDistributedDataParallel)
+    if not _dynamic_register.exists(PARALLEL, mmdp):
+        _dynamic_register.add(PARALLEL, mmdp)
+        PARALLEL.register_module(module_name=mmdp, module_cls=MMDataParallel)
 
 
 def register_hook_to_ms(hook_name, logger=None):
@@ -24,6 +57,10 @@ def register_hook_to_ms(hook_name, logger=None):
         raise ValueError(
             f'Not found hook "{hook_name}" in EasyCV hook registries!')
 
+    if _dynamic_register.exists(HOOKS, hook_name):
+        return
+    _dynamic_register.add(HOOKS, hook_name)
+
     obj = _EV_HOOKS._module_dict[hook_name]
     HOOKS.register_module(module_name=hook_name, module_cls=obj)
 
@@ -41,18 +78,19 @@ def register_part_mmcv_hooks_to_ms():
     from mmcv.runner.hooks import lr_updater
     from mmcv.runner.hooks import HOOKS as _MMCV_HOOKS
     from easycv.hooks import StepFixCosineAnnealingLrUpdaterHook, YOLOXLrUpdaterHook
-    from easycv.hooks.logger import PreLoggerHook
 
     mmcv_hooks_in_easycv = [('StepFixCosineAnnealingLrUpdaterHook',
                              StepFixCosineAnnealingLrUpdaterHook),
-                            ('YOLOXLrUpdaterHook', YOLOXLrUpdaterHook),
-                            ('PreLoggerHook', PreLoggerHook)]
+                            ('YOLOXLrUpdaterHook', YOLOXLrUpdaterHook)]
 
     members = inspect.getmembers(lr_updater)
     members.extend(mmcv_hooks_in_easycv)
 
     for name, obj in members:
         if name in _MMCV_HOOKS._module_dict:
+            if _dynamic_register.exists(HOOKS, name):
+                continue
+            _dynamic_register.add(HOOKS, name)
             HOOKS.register_module(
                 module_name=name,
                 module_cls=obj,
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index fa6f8a99..63a231b3 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -164,10 +164,14 @@ class EpochBasedTrainer(BaseTrainer):
         self.train_dataset = self.to_task_dataset(
             train_dataset,
             mode=ModeKeys.TRAIN,
+            task_data_config=self.cfg.dataset.get('train', None) if hasattr(
+                self.cfg, 'dataset') else None,
             preprocessor=self.train_preprocessor)
         self.eval_dataset = self.to_task_dataset(
             eval_dataset,
             mode=ModeKeys.EVAL,
+            task_data_config=self.cfg.dataset.get('val', None) if hasattr(
+                self.cfg, 'dataset') else None,
             preprocessor=self.eval_preprocessor)
 
         self.train_data_collator, self.eval_default_collate = None, None
@@ -298,6 +302,7 @@ class EpochBasedTrainer(BaseTrainer):
     def to_task_dataset(self,
                         datasets: Union[Dataset, List[Dataset]],
                         mode: str,
+                        task_data_config: Config = None,
                         preprocessor: Optional[Preprocessor] = None):
         """Build the task specific dataset processor for this trainer.
 
@@ -310,20 +315,29 @@ class EpochBasedTrainer(BaseTrainer):
             if isinstance(datasets, TorchTaskDataset):
                 return datasets
             elif isinstance(datasets, MsDataset):
-                cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
-                    else ConfigDict(type=None, mode=mode)
+                if task_data_config is None:
+                    # adapt to some special models
+                    task_data_config = ConfigDict(
+                        type=self.cfg.model.type) if hasattr(
+                            self.cfg, ConfigFields.model) else ConfigDict(
+                                type=None)
+                task_data_config.update(dict(mode=mode))
                 return datasets.to_torch_dataset(
-                    task_data_config=cfg,
-                    task_name=self.cfg.task
-                    if hasattr(self.cfg, ConfigFields.task) else None,
+                    task_data_config=task_data_config,
+                    task_name=self.cfg.task,
                     preprocessors=preprocessor)
             elif isinstance(datasets, List) and isinstance(
                     datasets[0], MsDataset):
-                cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
-                    else ConfigDict(type=None, mode=mode)
+                if task_data_config is None:
+                    # adapt to some special models
+                    task_data_config = ConfigDict(
+                        type=self.cfg.model.type) if hasattr(
+                            self.cfg, ConfigFields.model) else ConfigDict(
+                                type=None)
+                task_data_config.update(dict(mode=mode))
                 datasets = [
                     d.to_torch_dataset(
-                        task_data_config=cfg,
+                        task_data_config=task_data_config,
                         task_name=self.cfg.task,
                         preprocessors=preprocessor) for d in datasets
                 ]
@@ -331,12 +345,12 @@ class EpochBasedTrainer(BaseTrainer):
                     type=self.cfg.task, mode=mode, datasets=datasets)
                 return build_task_dataset(cfg, self.cfg.task)
             else:
-                cfg = ConfigDict(
-                    type=self.cfg.model.type,
-                    mode=mode,
-                    datasets=datasets,
-                    preprocessor=preprocessor)
-                return build_task_dataset(cfg, self.cfg.task)
+                task_data_config.update(
+                    dict(
+                        mode=mode,
+                        datasets=datasets,
+                        preprocessor=preprocessor))
+                return build_task_dataset(task_data_config, self.cfg.task)
         except Exception:
             if isinstance(datasets, (List, Tuple)) or preprocessor is not None:
                 return TorchTaskDataset(
diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py
index b30c674b..8fb621d3 100644
--- a/modelscope/utils/test_utils.py
+++ b/modelscope/utils/test_utils.py
@@ -14,10 +14,10 @@ import unittest
 from typing import OrderedDict
 
 import requests
-from datasets import Dataset
+import torch
 from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
+from torch.utils.data import Dataset
 
-from modelscope.msdatasets import MsDataset
 from .torch_utils import _find_free_port
 
 TEST_LEVEL = 2
@@ -49,9 +49,25 @@ def set_test_level(level: int):
     TEST_LEVEL = level
 
 
+class DummyTorchDataset(Dataset):
+
+    def __init__(self, feat, label, num) -> None:
+        self.feat = feat
+        self.label = label
+        self.num = num
+
+    def __getitem__(self, index):
+        return {
+            'feat': torch.Tensor(self.feat),
+            'labels': torch.Tensor(self.label)
+        }
+
+    def __len__(self):
+        return self.num
+
+
 def create_dummy_test_dataset(feat, label, num):
-    return MsDataset.from_hf_dataset(
-        Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num)))
+    return DummyTorchDataset(feat, label, num)
 
 
 def download_and_untar(fpath, furl, dst) -> str:
diff --git a/tests/trainers/easycv/test_easycv_trainer.py b/tests/trainers/easycv/test_easycv_trainer.py
index 6d1d7ec4..4bd63c55 100644
--- a/tests/trainers/easycv/test_easycv_trainer.py
+++ b/tests/trainers/easycv/test_easycv_trainer.py
@@ -6,10 +6,10 @@ import tempfile
 import unittest
 
 import json
-import requests
 import torch
 
 from modelscope.metainfo import Models, Pipelines, Trainers
+from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.config import Config
 from modelscope.utils.constant import LogKeys, ModeKeys, Tasks
@@ -18,55 +18,19 @@ from modelscope.utils.test_utils import DistributedTestCase, test_level
 from modelscope.utils.torch_utils import is_master
 
 
-def _download_data(url, save_dir):
-    r = requests.get(url, verify=True)
-    if not os.path.exists(save_dir):
-        os.makedirs(save_dir)
-    zip_name = os.path.split(url)[-1]
-    save_path = os.path.join(save_dir, zip_name)
-    with open(save_path, 'wb') as f:
-        f.write(r.content)
-
-    unpack_dir = os.path.join(save_dir, os.path.splitext(zip_name)[0])
-    shutil.unpack_archive(save_path, unpack_dir)
-
-
-def train_func(work_dir, dist=False, log_config=3, imgs_per_gpu=4):
+def train_func(work_dir, dist=False, log_interval=3, imgs_per_gpu=4):
     import easycv
     config_path = os.path.join(
         os.path.dirname(easycv.__file__),
         'configs/detection/yolox/yolox_s_8xb16_300e_coco.py')
 
-    data_dir = os.path.join(work_dir, 'small_coco_test')
-    url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/datasets/small_coco.zip'
-    if is_master():
-        _download_data(url, data_dir)
-
-    import time
-    time.sleep(1)
     cfg = Config.from_file(config_path)
 
-    cfg.work_dir = work_dir
-    cfg.total_epochs = 2
-    cfg.checkpoint_config.interval = 1
-    cfg.eval_config.interval = 1
-    cfg.log_config = dict(
-        interval=log_config,
-        hooks=[
+    cfg.log_config.update(
+        dict(hooks=[
             dict(type='TextLoggerHook'),
             dict(type='TensorboardLoggerHook')
-        ])
-    cfg.data.train.data_source.ann_file = os.path.join(
-        data_dir, 'small_coco/small_coco/instances_train2017_20.json')
-    cfg.data.train.data_source.img_prefix = os.path.join(
-        data_dir, 'small_coco/small_coco/train2017')
-    cfg.data.val.data_source.ann_file = os.path.join(
-        data_dir, 'small_coco/small_coco/instances_val2017_20.json')
-    cfg.data.val.data_source.img_prefix = os.path.join(
-        data_dir, 'small_coco/small_coco/val2017')
-    cfg.data.imgs_per_gpu = imgs_per_gpu
-    cfg.data.workers_per_gpu = 2
-    cfg.data.val.imgs_per_gpu = 2
+        ]))  # not support TensorboardLoggerHookV2
 
     ms_cfg_file = os.path.join(work_dir, 'ms_yolox_s_8xb16_300e_coco.json')
     from easycv.utils.ms_utils import to_ms_config
@@ -81,9 +45,41 @@ def train_func(work_dir, dist=False, log_config=3, imgs_per_gpu=4):
             save_path=ms_cfg_file)
 
     trainer_name = Trainers.easycv
+    train_dataset = MsDataset.load(
+        dataset_name='small_coco_for_test', namespace='EasyCV', split='train')
+    eval_dataset = MsDataset.load(
+        dataset_name='small_coco_for_test',
+        namespace='EasyCV',
+        split='validation')
+
+    cfg_options = {
+        'train.max_epochs':
+        2,
+        'train.dataloader.batch_size_per_gpu':
+        imgs_per_gpu,
+        'evaluation.dataloader.batch_size_per_gpu':
+        2,
+        'train.hooks': [
+            {
+                'type': 'CheckpointHook',
+                'interval': 1
+            },
+            {
+                'type': 'EvaluationHook',
+                'interval': 1
+            },
+            {
+                'type': 'TextLoggerHook',
+                'interval': log_interval
+            },
+        ]
+    }
     kwargs = dict(
-        task=Tasks.image_object_detection,
         cfg_file=ms_cfg_file,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        work_dir=work_dir,
+        cfg_options=cfg_options,
         launcher='pytorch' if dist else None)
 
     trainer = build_trainer(trainer_name, kwargs)
@@ -105,11 +101,8 @@ class EasyCVTrainerTestSingleGpu(unittest.TestCase):
         super().tearDown()
         shutil.rmtree(self.tmp_dir, ignore_errors=True)
 
-    @unittest.skipIf(
-        True, 'The test cases are all run in the master process, '
-        'cause registry conflicts, and it should run in the subprocess.')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_single_gpu(self):
-        # TODO: run in subprocess
         train_func(self.tmp_dir)
 
         results_files = os.listdir(self.tmp_dir)
@@ -185,7 +178,7 @@ class EasyCVTrainerTestMultiGpus(DistributedTestCase):
             num_gpus=2,
             work_dir=self.tmp_dir,
             dist=True,
-            log_config=2,
+            log_interval=2,
             imgs_per_gpu=5)
 
         results_files = os.listdir(self.tmp_dir)
diff --git a/tests/trainers/easycv/test_segformer.py b/tests/trainers/easycv/test_segformer.py
index 0da47ef6..08da6e41 100644
--- a/tests/trainers/easycv/test_segformer.py
+++ b/tests/trainers/easycv/test_segformer.py
@@ -5,28 +5,14 @@ import shutil
 import tempfile
 import unittest
 
-import requests
 import torch
 
 from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, Tasks
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import test_level
-from modelscope.utils.torch_utils import is_master
-
-
-def _download_data(url, save_dir):
-    r = requests.get(url, verify=True)
-    if not os.path.exists(save_dir):
-        os.makedirs(save_dir)
-    zip_name = os.path.split(url)[-1]
-    save_path = os.path.join(save_dir, zip_name)
-    with open(save_path, 'wb') as f:
-        f.write(r.content)
-
-    unpack_dir = os.path.join(save_dir, os.path.splitext(zip_name)[0])
-    shutil.unpack_archive(save_path, unpack_dir)
 
 
 @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
@@ -45,46 +31,32 @@ class EasyCVTrainerTestSegformer(unittest.TestCase):
         shutil.rmtree(self.tmp_dir, ignore_errors=True)
 
     def _train(self):
-        from modelscope.trainers.easycv.trainer import EasyCVEpochBasedTrainer
-
-        url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/datasets/small_coco_stuff164k.zip'
-        data_dir = os.path.join(self.tmp_dir, 'data')
-        if is_master():
-            _download_data(url, data_dir)
-
-        # adapt to ditributed mode
+        # adapt to distributed mode
         from easycv.utils.test_util import pseudo_dist_init
         pseudo_dist_init()
 
-        root_path = os.path.join(data_dir, 'small_coco_stuff164k')
-        cfg_options = {
-            'train.max_epochs':
-            2,
-            'dataset.train.data_source.img_root':
-            os.path.join(root_path, 'train2017'),
-            'dataset.train.data_source.label_root':
-            os.path.join(root_path, 'annotations/train2017'),
-            'dataset.train.data_source.split':
-            os.path.join(root_path, 'train.txt'),
-            'dataset.val.data_source.img_root':
-            os.path.join(root_path, 'val2017'),
-            'dataset.val.data_source.label_root':
-            os.path.join(root_path, 'annotations/val2017'),
-            'dataset.val.data_source.split':
-            os.path.join(root_path, 'val.txt'),
-        }
+        cfg_options = {'train.max_epochs': 2}
 
         trainer_name = Trainers.easycv
+        train_dataset = MsDataset.load(
+            dataset_name='small_coco_stuff164k',
+            namespace='EasyCV',
+            split='train')
+        eval_dataset = MsDataset.load(
+            dataset_name='small_coco_stuff164k',
+            namespace='EasyCV',
+            split='validation')
         kwargs = dict(
-            task=Tasks.image_segmentation,
             model='EasyCV/EasyCV-Segformer-b0',
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
             work_dir=self.tmp_dir,
             cfg_options=cfg_options)
 
         trainer = build_trainer(trainer_name, kwargs)
         trainer.train()
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_single_gpu_segformer(self):
         self._train()
 
diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py
index 86909f74..c73a56a3 100644
--- a/tests/trainers/test_trainer.py
+++ b/tests/trainers/test_trainer.py
@@ -64,7 +64,7 @@ class TrainerTest(unittest.TestCase):
         super().tearDown()
         shutil.rmtree(self.tmp_dir)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_0(self):
         json_cfg = {
             'task': Tasks.image_classification,
@@ -139,7 +139,7 @@ class TrainerTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_1(self):
         json_cfg = {
             'task': Tasks.image_classification,
@@ -200,7 +200,7 @@ class TrainerTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_with_default_config(self):
         json_cfg = {
             'task': Tasks.image_classification,
@@ -319,7 +319,7 @@ class TrainerTest(unittest.TestCase):
         for i in [2, 5, 8]:
             self.assertIn(MetricKeys.ACCURACY, lines[i])
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_with_iters_per_epoch(self):
         json_cfg = {
             'task': Tasks.image_classification,
@@ -441,7 +441,7 @@ class TrainerTest(unittest.TestCase):
 
 class DummyTrainerTest(unittest.TestCase):
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_dummy(self):
         default_args = dict(cfg_file='configs/examples/train.json')
         trainer = build_trainer('dummy', default_args)
diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py
index 3777772d..1f622287 100644
--- a/tests/trainers/test_trainer_gpu.py
+++ b/tests/trainers/test_trainer_gpu.py
@@ -17,7 +17,7 @@ from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
 from modelscope.models.base import Model
 from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
+from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks
 from modelscope.utils.test_utils import (DistributedTestCase,
                                          create_dummy_test_dataset, test_level)
 
@@ -55,6 +55,7 @@ class DummyModel(nn.Module, Model):
 
 def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs):
     json_cfg = {
+        'task': Tasks.image_classification,
         'train': {
             'work_dir': work_dir,
             'dataloader': {
@@ -119,7 +120,7 @@ class TrainerTestSingleGpu(unittest.TestCase):
         super().tearDown()
         shutil.rmtree(self.tmp_dir)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_single_gpu(self):
         train_func(self.tmp_dir)
 

From 84c384cc57152005e8e45422cdecc4817cb042e8 Mon Sep 17 00:00:00 2001
From: "shichen.fsc" <shichen.fsc@alibaba-inc.com>
Date: Fri, 9 Sep 2022 10:06:20 +0800
Subject: [PATCH 085/175] [to #42322933] add HTTP URL support for KWS
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10078262

---
 .../pipelines/audio/kws_kwsbp_pipeline.py     |  9 +++++
 modelscope/utils/audio/audio_utils.py         | 35 +++++++++++--------
 tests/pipelines/test_key_word_spotting.py     | 23 ++++++++++++
 3 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py
index 1f31766a..866b8d0b 100644
--- a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py
+++ b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py
@@ -8,6 +8,8 @@ from modelscope.models import Model
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import WavToLists
+from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav,
+                                                load_bytes_from_url)
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 
@@ -40,6 +42,13 @@ class KeyWordSpottingKwsbpPipeline(Pipeline):
         if self.preprocessor is None:
             self.preprocessor = WavToLists()
 
+        if isinstance(audio_in, str):
+            # load pcm data from url if audio_in is url str
+            audio_in = load_bytes_from_url(audio_in)
+        elif isinstance(audio_in, bytes):
+            # load pcm data from wav data if audio_in is wave format
+            audio_in = extract_pcm_from_wav(audio_in)
+
         output = self.preprocessor.forward(self.model.forward(), audio_in)
         output = self.forward(output)
         rst = self.postprocess(output)
diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py
index c93e0102..4c2c45cc 100644
--- a/modelscope/utils/audio/audio_utils.py
+++ b/modelscope/utils/audio/audio_utils.py
@@ -42,23 +42,28 @@ def extract_pcm_from_wav(wav: bytes) -> bytes:
     if len(data) > 44:
         frame_len = 44
         file_len = len(data)
-        header_fields = {}
-        header_fields['ChunkID'] = str(data[0:4], 'UTF-8')
-        header_fields['Format'] = str(data[8:12], 'UTF-8')
-        header_fields['Subchunk1ID'] = str(data[12:16], 'UTF-8')
-        if header_fields['ChunkID'] == 'RIFF' and header_fields[
-                'Format'] == 'WAVE' and header_fields['Subchunk1ID'] == 'fmt ':
-            header_fields['SubChunk1Size'] = struct.unpack('<I',
-                                                           data[16:20])[0]
+        try:
+            header_fields = {}
+            header_fields['ChunkID'] = str(data[0:4], 'UTF-8')
+            header_fields['Format'] = str(data[8:12], 'UTF-8')
+            header_fields['Subchunk1ID'] = str(data[12:16], 'UTF-8')
+            if header_fields['ChunkID'] == 'RIFF' and header_fields[
+                    'Format'] == 'WAVE' and header_fields[
+                        'Subchunk1ID'] == 'fmt ':
+                header_fields['SubChunk1Size'] = struct.unpack(
+                    '<I', data[16:20])[0]
 
-            if header_fields['SubChunk1Size'] == 16:
-                frame_len = 44
-            elif header_fields['SubChunk1Size'] == 18:
-                frame_len = 46
-            else:
-                return data
+                if header_fields['SubChunk1Size'] == 16:
+                    frame_len = 44
+                elif header_fields['SubChunk1Size'] == 18:
+                    frame_len = 46
+                else:
+                    return data
 
-            data = wav[frame_len:file_len]
+                data = wav[frame_len:file_len]
+        except Exception:
+            # no treatment
+            pass
 
     return data
 
diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py
index 20636a42..2f06936f 100644
--- a/tests/pipelines/test_key_word_spotting.py
+++ b/tests/pipelines/test_key_word_spotting.py
@@ -18,6 +18,7 @@ logger = get_logger()
 
 POS_WAV_FILE = 'data/test/audios/kws_xiaoyunxiaoyun.wav'
 BOFANGYINYUE_WAV_FILE = 'data/test/audios/kws_bofangyinyue.wav'
+URL_FILE = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/20200707_xiaoyun.wav'
 
 POS_TESTSETS_FILE = 'pos_testsets.tar.gz'
 POS_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testsets.tar.gz'
@@ -76,6 +77,22 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck):
                 }]
             }
         },
+        'test_run_with_url': {
+            'checking_item': [OutputKeys.KWS_LIST, 0, 'keyword'],
+            'checking_value': '小云小云',
+            'example': {
+                'wav_count':
+                1,
+                'kws_type':
+                'pcm',
+                'kws_list': [{
+                    'keyword': '小云小云',
+                    'offset': 0.69,
+                    'length': 1.67,
+                    'confidence': 0.996023
+                }]
+            }
+        },
         'test_run_with_pos_testsets': {
             'checking_item': ['recall'],
             'example': {
@@ -237,6 +254,12 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck):
         self.check_result('test_run_with_wav_by_customized_keywords',
                           kws_result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_url(self):
+        kws_result = self.run_pipeline(
+            model_id=self.model_id, audio_in=URL_FILE)
+        self.check_result('test_run_with_url', kws_result)
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_pos_testsets(self):
         wav_file_path = download_and_untar(

From 4be7737122b0e09500de583a923f1e1366c09efc Mon Sep 17 00:00:00 2001
From: "bin.xue" <bin.xue@alibaba-inc.com>
Date: Fri, 9 Sep 2022 13:51:09 +0800
Subject: [PATCH 086/175] [to #42322933] audio pipelines accept url as input

---
 modelscope/pipelines/audio/ans_pipeline.py    |  9 +++--
 .../pipelines/audio/kws_farfield_pipeline.py  |  5 +++
 modelscope/preprocessors/audio.py             |  8 +++--
 .../test_key_word_spotting_farfield.py        | 12 ++++++-
 tests/pipelines/test_speech_signal_process.py | 33 +++++++++++++++++--
 5 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py
index 5ed4d769..62399684 100644
--- a/modelscope/pipelines/audio/ans_pipeline.py
+++ b/modelscope/pipelines/audio/ans_pipeline.py
@@ -6,6 +6,7 @@ import numpy as np
 import soundfile as sf
 import torch
 
+from modelscope.fileio import File
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
@@ -34,11 +35,12 @@ class ANSPipeline(Pipeline):
         super().__init__(model=model, **kwargs)
         self.model.eval()
 
-    def preprocess(self, inputs: Input) -> Dict[str, Any]:
+    def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
         if isinstance(inputs, bytes):
             data1, fs = sf.read(io.BytesIO(inputs))
         elif isinstance(inputs, str):
-            data1, fs = sf.read(inputs)
+            file_bytes = File.read(inputs)
+            data1, fs = sf.read(io.BytesIO(file_bytes))
         else:
             raise TypeError(f'Unsupported type {type(inputs)}.')
         if len(data1.shape) > 1:
@@ -50,7 +52,8 @@ class ANSPipeline(Pipeline):
         inputs = np.reshape(data, [1, data.shape[0]])
         return {'ndarray': inputs, 'nsamples': data.shape[0]}
 
-    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
         ndarray = inputs['ndarray']
         if isinstance(ndarray, torch.Tensor):
             ndarray = ndarray.cpu().numpy()
diff --git a/modelscope/pipelines/audio/kws_farfield_pipeline.py b/modelscope/pipelines/audio/kws_farfield_pipeline.py
index a114e7fb..62848a27 100644
--- a/modelscope/pipelines/audio/kws_farfield_pipeline.py
+++ b/modelscope/pipelines/audio/kws_farfield_pipeline.py
@@ -2,6 +2,7 @@ import io
 import wave
 from typing import Any, Dict
 
+from modelscope.fileio import File
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
@@ -39,6 +40,8 @@ class KWSFarfieldPipeline(Pipeline):
     def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
         if isinstance(inputs, bytes):
             return dict(input_file=inputs)
+        elif isinstance(inputs, str):
+            return dict(input_file=inputs)
         elif isinstance(inputs, Dict):
             return inputs
         else:
@@ -47,6 +50,8 @@ class KWSFarfieldPipeline(Pipeline):
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
         input_file = inputs['input_file']
+        if isinstance(input_file, str):
+            input_file = File.read(input_file)
         if isinstance(input_file, bytes):
             input_file = io.BytesIO(input_file)
         self.frame_count = 0
diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py
index 10057034..dd2f1fc1 100644
--- a/modelscope/preprocessors/audio.py
+++ b/modelscope/preprocessors/audio.py
@@ -6,9 +6,10 @@ import numpy as np
 import scipy.io.wavfile as wav
 import torch
 
+from modelscope.fileio import File
+from modelscope.preprocessors import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
 from modelscope.utils.constant import Fields
-from . import Preprocessor
-from .builder import PREPROCESSORS
 
 
 def load_kaldi_feature_transform(filename):
@@ -201,7 +202,8 @@ class LinearAECAndFbank(Preprocessor):
         if isinstance(inputs, bytes):
             inputs = io.BytesIO(inputs)
         elif isinstance(inputs, str):
-            pass
+            file_bytes = File.read(inputs)
+            inputs = io.BytesIO(file_bytes)
         else:
             raise TypeError(f'Unsupported input type: {type(inputs)}.')
         sample_rate, data = wav.read(inputs)
diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py
index 4a732950..1b23a6a7 100644
--- a/tests/pipelines/test_key_word_spotting_farfield.py
+++ b/tests/pipelines/test_key_word_spotting_farfield.py
@@ -6,6 +6,9 @@ from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level
 
 TEST_SPEECH_FILE = 'data/test/audios/3ch_nihaomiya.wav'
+TEST_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \
+                  'speech_dfsmn_kws_char_farfield_16k_nihaomiya/repo' \
+                  '?Revision=master&FilePath=examples/3ch_nihaomiya.wav'
 
 
 class KWSFarfieldTest(unittest.TestCase):
@@ -13,7 +16,7 @@ class KWSFarfieldTest(unittest.TestCase):
     def setUp(self) -> None:
         self.model_id = 'damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya'
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_normal(self):
         kws = pipeline(Tasks.keyword_spotting, model=self.model_id)
         inputs = {'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE)}
@@ -21,6 +24,13 @@ class KWSFarfieldTest(unittest.TestCase):
         self.assertEqual(len(result['kws_list']), 5)
         print(result['kws_list'][-1])
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_url(self):
+        kws = pipeline(Tasks.keyword_spotting, model=self.model_id)
+        result = kws(TEST_SPEECH_URL)
+        self.assertEqual(len(result['kws_list']), 5)
+        print(result['kws_list'][-1])
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_output(self):
         kws = pipeline(Tasks.keyword_spotting, model=self.model_id)
diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py
index 8ca6bf1d..e1987c28 100644
--- a/tests/pipelines/test_speech_signal_process.py
+++ b/tests/pipelines/test_speech_signal_process.py
@@ -9,8 +9,17 @@ from modelscope.utils.test_utils import test_level
 
 NEAREND_MIC_FILE = 'data/test/audios/nearend_mic.wav'
 FAREND_SPEECH_FILE = 'data/test/audios/farend_speech.wav'
+NEAREND_MIC_URL = 'https://modelscope.cn/api/v1/models/damo/' \
+                  'speech_dfsmn_aec_psm_16k/repo?Revision=master' \
+                  '&FilePath=examples/nearend_mic.wav'
+FAREND_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \
+                    'speech_dfsmn_aec_psm_16k/repo?Revision=master' \
+                    '&FilePath=examples/farend_speech.wav'
 
 NOISE_SPEECH_FILE = 'data/test/audios/speech_with_noise.wav'
+NOISE_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \
+                   'speech_frcrn_ans_cirm_16k/repo?Revision=master' \
+                   '&FilePath=examples/speech_with_noise.wav'
 
 
 class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck):
@@ -18,7 +27,7 @@ class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck):
     def setUp(self) -> None:
         pass
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_aec(self):
         model_id = 'damo/speech_dfsmn_aec_psm_16k'
         input = {
@@ -30,6 +39,18 @@ class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck):
         aec(input, output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_aec_url(self):
+        model_id = 'damo/speech_dfsmn_aec_psm_16k'
+        input = {
+            'nearend_mic': NEAREND_MIC_URL,
+            'farend_speech': FAREND_SPEECH_URL
+        }
+        aec = pipeline(Tasks.acoustic_echo_cancellation, model=model_id)
+        output_path = os.path.abspath('output.wav')
+        aec(input, output_path=output_path)
+        print(f'Processed audio saved to {output_path}')
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_aec_bytes(self):
         model_id = 'damo/speech_dfsmn_aec_psm_16k'
@@ -62,7 +83,7 @@ class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck):
         aec(inputs, output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ans(self):
         model_id = 'damo/speech_frcrn_ans_cirm_16k'
         ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id)
@@ -71,6 +92,14 @@ class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck):
             output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_ans_url(self):
+        model_id = 'damo/speech_frcrn_ans_cirm_16k'
+        ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id)
+        output_path = os.path.abspath('output.wav')
+        ans(NOISE_SPEECH_URL, output_path=output_path)
+        print(f'Processed audio saved to {output_path}')
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ans_bytes(self):
         model_id = 'damo/speech_frcrn_ans_cirm_16k'

From b41b10f8970a748dee26baf8e5e1d13e04568e54 Mon Sep 17 00:00:00 2001
From: ly119399 <ly119399@alibaba-inc.com>
Date: Fri, 9 Sep 2022 14:27:08 +0800
Subject: [PATCH 087/175] [to #42322933] space finetune on generation task     
    Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10061562

---
 modelscope/metainfo.py                        |   1 +
 modelscope/models/nlp/space/model/__init__.py |   2 +-
 .../models/nlp/space/model/generator.py       |  10 +-
 .../nlp/space/space_for_dialog_modeling.py    |   2 +-
 .../space/dialog_modeling_preprocessor.py     |   2 +-
 .../preprocessors/space/fields/gen_field.py   | 236 ++++-
 .../nlp/space/dialog_modeling_trainer.py      | 130 +++
 modelscope/trainers/nlp/space/eval.py         | 952 ++++++++++++++++++
 .../trainers/nlp/space/trainer/gen_trainer.py |  72 +-
 modelscope/utils/nlp/space/clean_dataset.py   | 333 ++++++
 modelscope/utils/nlp/space/utils.py           |  12 +-
 .../trainers/test_dialog_modeling_trainer.py  |  68 ++
 12 files changed, 1744 insertions(+), 76 deletions(-)
 create mode 100644 modelscope/trainers/nlp/space/dialog_modeling_trainer.py
 create mode 100644 modelscope/trainers/nlp/space/eval.py
 create mode 100644 modelscope/utils/nlp/space/clean_dataset.py
 create mode 100644 tests/trainers/test_dialog_modeling_trainer.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index e051bb76..63b4f1c2 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -241,6 +241,7 @@ class Trainers(object):
 
     # nlp trainers
     bert_sentiment_analysis = 'bert-sentiment-analysis'
+    dialog_modeling_trainer = 'dialog-modeling-trainer'
     dialog_intent_trainer = 'dialog-intent-trainer'
     nlp_base_trainer = 'nlp-base-trainer'
     nlp_veco_trainer = 'nlp-veco-trainer'
diff --git a/modelscope/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py
index 24641f06..bb1d18e4 100644
--- a/modelscope/models/nlp/space/model/__init__.py
+++ b/modelscope/models/nlp/space/model/__init__.py
@@ -1,6 +1,6 @@
 from .configuration_space import SpaceConfig
 from .gen_unified_transformer import GenUnifiedTransformer
-from .generator import Generator as SpaceGenerator
+from .generator import SpaceGenerator
 from .intent_unified_transformer import IntentUnifiedTransformer
 from .model_base import SpaceModelBase
 from .modeling_space import (SpaceForDST, SpaceForMaskedLM,
diff --git a/modelscope/models/nlp/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py
index c1521e3d..0e7833e6 100644
--- a/modelscope/models/nlp/space/model/generator.py
+++ b/modelscope/models/nlp/space/model/generator.py
@@ -38,24 +38,24 @@ def gather(var, idx):
         return var
 
 
-class Generator(object):
+class SpaceGenerator(object):
     """ Genrator class. """
 
     _registry = dict()
 
     @classmethod
     def register(cls, name):
-        Generator._registry[name] = cls
+        SpaceGenerator._registry[name] = cls
         return
 
     @staticmethod
     def by_name(name):
-        return Generator._registry[name]
+        return SpaceGenerator._registry[name]
 
     @staticmethod
     def create(config, *args, **kwargs):
         """ Create generator. """
-        generator_cls = Generator.by_name(config.Generator.generator)
+        generator_cls = SpaceGenerator.by_name(config.Generator.generator)
         return generator_cls(config, *args, **kwargs)
 
     def __init__(self, config, reader):
@@ -83,7 +83,7 @@ class Generator(object):
         raise NotImplementedError
 
 
-class BeamSearch(Generator):
+class BeamSearch(SpaceGenerator):
     """ BeamSearch generator. """
 
     def __init__(self, config, reader):
diff --git a/modelscope/models/nlp/space/space_for_dialog_modeling.py b/modelscope/models/nlp/space/space_for_dialog_modeling.py
index 4c65c7d1..efa9b851 100644
--- a/modelscope/models/nlp/space/space_for_dialog_modeling.py
+++ b/modelscope/models/nlp/space/space_for_dialog_modeling.py
@@ -41,7 +41,7 @@ class SpaceForDialogModeling(TorchModel):
 
         self.text_field = kwargs.pop(
             'text_field',
-            MultiWOZBPETextField(self.model_dir, config=self.config))
+            MultiWOZBPETextField(config=self.config, model_dir=self.model_dir))
         self.generator = SpaceGenerator.create(
             self.config, reader=self.text_field)
         self.model = SpaceModelBase.create(
diff --git a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py
index a2157c2b..c461ade1 100644
--- a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py
+++ b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py
@@ -35,7 +35,7 @@ class DialogModelingPreprocessor(Preprocessor):
         self.config.use_gpu = self.config.use_gpu and torch.cuda.is_available()
 
         self.text_field = MultiWOZBPETextField(
-            self.model_dir, config=self.config)
+            config=self.config, model_dir=self.model_dir)
 
     @type_assert(object, Dict)
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/space/fields/gen_field.py
index 5bff360f..32346bd5 100644
--- a/modelscope/preprocessors/space/fields/gen_field.py
+++ b/modelscope/preprocessors/space/fields/gen_field.py
@@ -2,9 +2,11 @@
 
 import os
 import random
+from asyncio import constants
 from collections import OrderedDict
 from itertools import chain
 
+import json
 import numpy as np
 
 from modelscope.preprocessors.space.tokenizer import Tokenizer
@@ -117,7 +119,8 @@ class BPETextField(object):
         return self.tokenizer.convert_tokens_to_ids([self.eos_d_token])[0]
 
     def __init__(self, config):
-        self.gpu = 0
+        self.train, self.dev, self.test = [], [], []
+        self.gpu = config.Trainer.gpu
         self.tokenizer = None
         self.vocab = None
         self.db = None
@@ -249,13 +252,9 @@ class BPETextField(object):
         for dial in data:
             batch.append(dial)
             if len(batch) == self.batch_size:
-                # print('batch size: %d, batch num +1'%(len(batch)))
                 all_batches.append(batch)
                 batch = []
-        # if remainder > 1/2 batch_size, just put them in the previous batch, otherwise form a new batch
-        # print('last batch size: %d, batch num +1'%(len(batch)))
-        # if (len(batch) % len(cfg.cuda_device)) != 0:
-        #     batch = batch[:-(len(batch) % len(cfg.cuda_device))]
+
         # TODO deal with deleted data
         if self.gpu <= 1:
             if len(batch) > 0.5 * self.batch_size:
@@ -308,7 +307,7 @@ class BPETextField(object):
 
 class MultiWOZBPETextField(BPETextField):
 
-    def __init__(self, model_dir, config):
+    def __init__(self, config, **kwargs):
         super(MultiWOZBPETextField, self).__init__(config)
 
         import spacy
@@ -327,8 +326,12 @@ class MultiWOZBPETextField(BPETextField):
                 )
         self.nlp = spacy.load('en_core_web_sm')
 
+        if config.do_train:
+            db_dir = kwargs['data_dir']
+        else:
+            db_dir = kwargs['model_dir']
         self.db = MultiWozDB(
-            model_dir, {
+            db_dir, {
                 'attraction': 'db/attraction_db_processed.json',
                 'hospital': 'db/hospital_db_processed.json',
                 'hotel': 'db/hotel_db_processed.json',
@@ -337,14 +340,14 @@ class MultiWOZBPETextField(BPETextField):
                 'taxi': 'db/taxi_db_processed.json',
                 'train': 'db/train_db_processed.json',
             })
-        self._build_vocab(model_dir)
+        self._build_vocab(db_dir)
 
         special_tokens = [
             self.pad_token, self.bos_token, self.eos_token, self.unk_token
         ]
         special_tokens.extend(self.add_sepcial_tokens())
         self.tokenizer = Tokenizer(
-            vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE),
+            vocab_path=os.path.join(kwargs['model_dir'], ModelFile.VOCAB_FILE),
             special_tokens=special_tokens,
             tokenizer_type=config.BPETextField.tokenizer_type)
         self.understand_ids = self.tokenizer.convert_tokens_to_ids(
@@ -352,6 +355,26 @@ class MultiWOZBPETextField(BPETextField):
         self.policy_ids = self.tokenizer.convert_tokens_to_ids(
             self.policy_tokens)
 
+        if config.do_train:
+            test_list = [
+                line.strip().lower() for line in open(
+                    os.path.join(kwargs['data_dir'], 'testListFile.json'),
+                    'r').readlines()
+            ]
+            dev_list = [
+                line.strip().lower() for line in open(
+                    os.path.join(kwargs['data_dir'], 'valListFile.json'),
+                    'r').readlines()
+            ]
+
+            self.dev_files, self.test_files = {}, {}
+            for fn in test_list:
+                self.test_files[fn.replace('.json', '')] = 1
+            for fn in dev_list:
+                self.dev_files[fn.replace('.json', '')] = 1
+
+            self._load_data(kwargs['data_dir'])
+
         return
 
     def get_ids(self, data: str):
@@ -414,7 +437,6 @@ class MultiWOZBPETextField(BPETextField):
         name_to_set = {'train': self.train, 'test': self.test, 'dev': self.dev}
         dial = name_to_set[set_name]
         turn_bucket = self._bucket_by_turn(dial)
-        # self._shuffle_turn_bucket(turn_bucket)
         all_batches = []
 
         if set_name not in self.set_stats:
@@ -433,19 +455,13 @@ class MultiWOZBPETextField(BPETextField):
             except Exception:
                 log_str += 'turn num:%d, dial num: %d, batch num: %d last batch len: %d\n' % (
                     k, len(turn_bucket[k]), len(batches), 0.0)
-            # print("turn num:%d, dial num:v%d, batch num: %d, "%(k, len(turn_bucket[k]), len(batches)))
+
             num_training_steps += k * len(batches)
             num_turns += k * len(turn_bucket[k])
             num_dials += len(turn_bucket[k])
             all_batches += batches
         log_str += 'total batch num: %d\n' % len(all_batches)
-        # print('total batch num: %d'%len(all_batches))
-        # print('dialog count: %d'%dia_count)
-        # return all_batches
 
-        # log stats
-        # logging.info(log_str)
-        # cfg.num_training_steps = num_training_steps * cfg.epoch_num
         self.set_stats[set_name][
             'num_training_steps_per_epoch'] = num_training_steps  # turn-level steps
         self.set_stats[set_name]['num_turns'] = num_turns
@@ -484,6 +500,71 @@ class MultiWOZBPETextField(BPETextField):
         self.vocab.load_vocab(vp)
         return self.vocab.vocab_size
 
+    def _load_data(self, data_dir, save_temp=True):
+        """
+        load processed data and encode, or load already encoded data
+        """
+
+        def load_data_from_resource(data_resource):
+            data = json.loads(
+                open(
+                    os.path.join(data_dir, data_resource),
+                    'r',
+                    encoding='utf-8').read().lower())
+            train, dev, test = [], [], []
+            for fn, dial in data.items():
+                if '.json' in fn:
+                    fn = fn.replace('.json', '')
+                if self.dev_files.get(fn):
+                    dev.append(self._get_encoded_data(fn, dial))
+                elif self.test_files.get(fn):
+                    test.append(self._get_encoded_data(fn, dial))
+                else:
+                    train.append(self._get_encoded_data(fn, dial))
+            return train, dev, test
+
+        data_processed = 'new_db_se_blank_encoded_domain.data.json'
+        data_resource = 'data_for_damd.json'
+        if save_temp:  # save encoded data
+            # encoded: no sos, se_encoded: sos and eos
+            encoded_file = os.path.join(data_dir, data_processed)
+
+            if os.path.exists(encoded_file):
+                logger.info(
+                    'Reading encoded data from {}'.format(encoded_file))
+                self.data = json.loads(
+                    open(
+                        os.path.join(data_dir, data_resource),
+                        'r',
+                        encoding='utf-8').read().lower())
+                encoded_data = json.loads(
+                    open(encoded_file, 'r', encoding='utf-8').read())
+                self.train = encoded_data['train']
+                self.dev = encoded_data['dev']
+                self.test = encoded_data['test']
+            else:
+                logger.info(
+                    'Encoding data now and save the encoded data in {}'.format(
+                        encoded_file))
+                # not exists, encode data and save
+                self.train, self.dev, self.test = load_data_from_resource(
+                    data_resource)
+                # save encoded data
+                encoded_data = {
+                    'train': self.train,
+                    'dev': self.dev,
+                    'test': self.test
+                }
+                json.dump(encoded_data, open(encoded_file, 'w'), indent=2)
+        else:  # directly read processed data and encode
+            self.train, self.dev, self.test = load_data_from_resource(
+                data_resource)
+
+        random.seed(10)
+        random.shuffle(self.train)
+        logger.info('train size:{}, dev size:{}, test size:{}'.format(
+            len(self.train), len(self.dev), len(self.test)))
+
     def _get_convert_str(self, sent):
         assert isinstance(sent, str)
         return ' '.join([
@@ -491,14 +572,65 @@ class MultiWOZBPETextField(BPETextField):
             for tok in sent.split()
         ])
 
+    def _get_encoded_data(self, fn, dial):
+        encoded_dial = []
+        for idx, t in enumerate(dial['log']):  # tokenize to list of ids
+            enc = {}
+            enc['dial_id'] = fn
+
+            enc_info_list = [
+                ('user', self.sos_u_id, 'user', self.eos_u_id),
+                ('usdx', self.sos_u_id, 'user', self.eos_u_id),
+                ('resp', self.sos_r_id, 'resp', self.eos_r_id),
+                ('bspn', self.sos_b_id, 'constraint', self.eos_b_id),
+                ('bsdx', self.sos_b_id, 'cons_delex', self.eos_b_id),
+                ('aspn', self.sos_a_id, 'sys_act', self.eos_a_id)
+            ]
+            for enc_key, start_token, item_key, end_token in enc_info_list:
+                enc[enc_key] = [
+                    start_token
+                ] + self.tokenizer.convert_tokens_to_ids(
+                    self.tokenizer.tokenize(
+                        self._get_convert_str(t[item_key]))) + [end_token]
+
+            enc['turn_num'] = t['turn_num']
+
+            if idx > 0 and t['turn_domain'] == '[general]':
+                enc['dspn'] = encoded_dial[idx - 1]['dspn']
+                enc['pointer'] = encoded_dial[idx - 1]['pointer'][:4] + [
+                    int(i) for i in t['pointer'].split(',')
+                ][-2:]
+                enc['turn_domain'] = encoded_dial[idx - 1]['turn_domain']
+                enc['db'] = encoded_dial[idx - 1]['db']
+            else:
+                if t['turn_domain'] == '[general]':
+                    assert not t['constraint'], f'{fn}-{idx}'
+                enc['dspn'] = [
+                    self.sos_d_id
+                ] + self.tokenizer.convert_tokens_to_ids(
+                    self.tokenizer.tokenize(
+                        self._get_convert_str(
+                            t['turn_domain']))) + [self.eos_d_id]
+                enc['pointer'] = [int(i) for i in t['pointer'].split(',')]
+                enc['turn_domain'] = t['turn_domain'].split()
+                db_pointer = self.bspan_to_DBpointer(t['constraint'],
+                                                     t['turn_domain'].split())
+                enc['db'] = [
+                    self.sos_db_id
+                ] + self.tokenizer.convert_tokens_to_ids(
+                    self.tokenizer.tokenize(
+                        self._get_convert_str(db_pointer))) + [self.eos_db_id]
+
+            encoded_dial.append(enc)
+        return encoded_dial
+
     def bspan_to_DBpointer(self, bspan, turn_domain):
         constraint_dict = self.bspan_to_constraint_dict(bspan)
-        # print(constraint_dict)
         matnums = self.db.get_match_num(constraint_dict)
         match_dom = turn_domain[0] if len(turn_domain) == 1 else turn_domain[1]
         match_dom = match_dom[1:-1] if match_dom.startswith('[') else match_dom
         match = matnums[match_dom]
-        # vector = self.db.addDBPointer(match_dom, match)
+
         vector = self.db.addDBIndicator(match_dom, match)
         return vector
 
@@ -691,3 +823,67 @@ class MultiWOZBPETextField(BPETextField):
                 inputs['labels'] = [context]  # use previous turn
 
         return inputs, prompt_id
+
+    def restore(self, resp, domain, constraint_dict, mat_ents):
+        """Replace [value_*] placeholders in resp with concrete values."""
+        restored = resp
+        restored = restored.replace('[value_reference]', '53022')
+        restored = restored.replace('[value_car]', 'BMW')
+
+        for d in domain:
+            constraint = constraint_dict.get(d, None)
+            if constraint:
+                replace_res_list = [('stay', '[value_stay]'),
+                                    ('day', '[value_day]'),
+                                    ('people', '[value_people]'),
+                                    ('time', '[value_time]'),
+                                    ('type', '[value_type]')]
+                for key, value_key in replace_res_list:
+                    if key in constraint:
+                        restored = restored.replace(value_key, constraint[key])
+
+                if d in mat_ents and len(mat_ents[d]) == 0:
+                    for s in constraint:
+                        if s == 'pricerange' and d in [
+                                'hotel', 'restaurant'
+                        ] and 'price]' in restored:
+                            restored = restored.replace(
+                                '[value_price]', constraint['pricerange'])
+                        if s + ']' in restored:
+                            restored = restored.replace(
+                                '[value_%s]' % s, constraint[s])
+
+            if '[value_choice' in restored and mat_ents.get(d):
+                restored = restored.replace('[value_choice]',
+                                            str(len(mat_ents[d])))
+        if '[value_choice' in restored:
+            restored = restored.replace('[value_choice]', '3')
+
+        try:
+            ent = mat_ents.get(domain[-1], [])
+            if ent:
+                ent = ent[0]
+
+                for t in restored.split():
+                    if '[value' in t:
+                        slot = t[7:-1]
+                        if ent.get(slot):
+                            if domain[-1] == 'hotel' and slot == 'price':
+                                slot = 'pricerange'
+                            restored = restored.replace(t, ent[slot])
+                        elif slot == 'price':
+                            if ent.get('pricerange'):
+                                restored = restored.replace(
+                                    t, ent['pricerange'])
+                            else:
+                                logger.info('%s %s', restored, domain)
+        except Exception:
+            logger.error(resp)
+            logger.error(restored)
+            raise  # re-raise instead of quit(): never kill the host process
+
+        restored = restored.replace('[value_phone]', '62781111')
+        restored = restored.replace('[value_postcode]', 'CG9566')
+        restored = restored.replace('[value_address]', 'Parkside, Cambridge')
+
+        return restored
diff --git a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py
new file mode 100644
index 00000000..6bdd8a3a
--- /dev/null
+++ b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py
@@ -0,0 +1,130 @@
+import os
+import time
+from typing import Callable, Dict, Optional, Tuple, Union
+
+import numpy as np
+
+from modelscope.metainfo import Trainers
+from modelscope.models.nlp.space.model.generator import SpaceGenerator
+from modelscope.models.nlp.space.model.model_base import SpaceModelBase
+from modelscope.preprocessors.space.fields.gen_field import \
+    MultiWOZBPETextField
+from modelscope.trainers.base import BaseTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.nlp.space.eval import MultiWOZEvaluator
+from modelscope.trainers.nlp.space.trainer.gen_trainer import MultiWOZTrainer
+from modelscope.utils.config import Config, ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def setup_seed(seed: int):
+    import random
+    import torch
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+
+
+@TRAINERS.register_module(module_name=Trainers.dialog_modeling_trainer)
+class DialogModelingTrainer(BaseTrainer):
+
+    def __init__(self,
+                 cfg_file: Optional[str] = None,
+                 cfg_modify_fn: Optional[Callable] = None,
+                 *args,
+                 **kwargs):
+
+        super().__init__(os.path.join(kwargs['model_dir'], kwargs['cfg_name']))
+
+        self.cfg_modify_fn = cfg_modify_fn
+        self.cfg = self.rebuild_config(self.cfg)
+
+        setup_seed(self.cfg.Trainer.seed)
+
+        # set reader and evaluator
+        self.bpe = MultiWOZBPETextField(self.cfg, **kwargs)
+
+        self.cfg.Model.num_token_embeddings = self.bpe.vocab_size
+        self.cfg.Model.num_turn_embeddings = self.bpe.max_ctx_turn + 1
+
+        if 'work_dir' in kwargs:
+            self.cfg.Trainer.save_dir = kwargs['work_dir']
+        else:
+            self.cfg.Trainer.save_dir = './default_save_dir'
+
+        # set data and data status
+        self.train_data = self.bpe.get_batches('train')
+        self.dev_data = self.bpe.get_batches('dev')
+
+        self.evaluator = MultiWOZEvaluator(reader=self.bpe, **kwargs)
+        # set generator
+        self.generator = SpaceGenerator.create(self.cfg, reader=self.bpe)
+        self._load_model(**kwargs)
+
+    def _load_model(self, **kwargs):
+
+        def to_tensor(array):
+            """
+            numpy array -> tensor
+            """
+            import torch
+            array = torch.tensor(array)
+            return array.cuda(
+            ) if self.cfg.use_gpu and torch.cuda.is_available() else array
+
+        # construct model
+        if 'model' in kwargs:
+            self.model = kwargs['model']
+        else:
+            self.model = SpaceModelBase.create(
+                kwargs['model_dir'],
+                self.cfg,
+                reader=self.bpe,
+                generator=self.generator)
+
+        import torch
+        # multi-gpu
+        if self.cfg.Trainer.gpu > 1 and torch.cuda.device_count() > 1:
+            self.model = torch.nn.DataParallel(self.model)
+
+        # construct trainer
+        self.trainer = MultiWOZTrainer(
+            self.model,
+            to_tensor,
+            self.cfg,
+            reader=self.bpe,
+            evaluator=self.evaluator)
+        self.trainer.set_optimizers()
+        # load model, optimizer and lr_scheduler
+        self.trainer.load()
+
+    def rebuild_config(self, cfg: Config):
+        if self.cfg_modify_fn is not None:
+            return self.cfg_modify_fn(cfg)
+        return cfg
+
+    def train(self, *args, **kwargs):
+        logger.info('Train')
+
+        self.trainer.train(train_data=self.train_data, dev_data=self.dev_data)
+
+    def evaluate(self,
+                 checkpoint_path: Optional[str] = None,
+                 *args,
+                 **kwargs) -> Dict[str, float]:
+        """Reload the model from checkpoint_path and infer on the test set."""
+        logger.info('Evaluate')
+        self.cfg.do_infer = True
+
+        if checkpoint_path is None:
+            raise ValueError('checkpoint_path is required for evaluation')
+        # os.path.split also handles paths without a '/' separator
+        checkpoint_dir, checkpoint_name = os.path.split(checkpoint_path)
+        assert checkpoint_name == ModelFile.TORCH_MODEL_BIN_FILE
+        kwargs['model_dir'] = checkpoint_dir
+        self._load_model(**kwargs)
+        self.trainer.infer(data_type='test')
diff --git a/modelscope/trainers/nlp/space/eval.py b/modelscope/trainers/nlp/space/eval.py
new file mode 100644
index 00000000..f315ff07
--- /dev/null
+++ b/modelscope/trainers/nlp/space/eval.py
@@ -0,0 +1,952 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright from https://github.com/thu-spmi/LABES
+# Copyright from https://github.com/TonyNemo/UBAR-MultiWOZ
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from collections import Counter
+
+import json
+import numpy as np
+from nltk.util import ngrams
+from sklearn.metrics import f1_score
+
+from modelscope.utils.nlp.space import ontology, utils
+from modelscope.utils.nlp.space.clean_dataset import clean_slot_values
+
+
+def similar(a, b):
+    return a == b or a in b or b in a or a.split()[0] == b.split(
+    )[0] or a.split()[-1] == b.split()[-1]
+
+
+def setsub(a, b):
+    junks_a = []
+    useless_constraint = [
+        'temperature', 'week', 'est ', 'quick', 'reminder', 'near'
+    ]
+    for i in a:
+        flg = False
+        for j in b:
+            if similar(i, j):
+                flg = True
+        if not flg:
+            junks_a.append(i)
+    for junk in junks_a:
+        flg = False
+        for item in useless_constraint:
+            if item in junk:
+                flg = True
+        if not flg:
+            return False
+    return True
+
+
+def setsim(a, b):
+    a, b = set(a), set(b)
+    return setsub(a, b) and setsub(b, a)
+
+
+def DA_evaluate(preds, labels):
+    preds = np.array(preds)
+    labels = np.array(labels)
+    results = {}
+
+    for avg_name in ['micro']:
+        my_f1_score = f1_score(y_true=labels, y_pred=preds, average=avg_name)
+        results['f1_{}'.format(avg_name)] = my_f1_score
+
+    return results
+
+
+class BLEUScorer(object):
+    # BLEU score calculator via GentScorer interface
+    # it calculates the BLEU-4 by taking the entire corpus in
+    # Calculate based on multiple candidates against multiple references
+    def __init__(self):
+        pass
+
+    def score(self, parallel_corpus):
+
+        # containers
+        count = [0, 0, 0, 0]
+        clip_count = [0, 0, 0, 0]
+        r = 0
+        c = 0
+        weights = [0.25, 0.25, 0.25, 0.25]
+
+        # accumulate ngram statistics
+        for hyps, refs in parallel_corpus:
+            hyps = [hyp.split() for hyp in hyps]
+            refs = [ref.split() for ref in refs]
+            for hyp in hyps:
+
+                for i in range(4):
+                    # accumulate ngram counts
+                    hypcnts = Counter(ngrams(hyp, i + 1))
+                    cnt = sum(hypcnts.values())
+                    count[i] += cnt
+
+                    # compute clipped counts
+                    max_counts = {}
+                    for ref in refs:
+                        refcnts = Counter(ngrams(ref, i + 1))
+                        for ng in hypcnts:
+                            max_counts[ng] = max(
+                                max_counts.get(ng, 0), refcnts[ng])
+                    clipcnt = \
+                        dict((ng, min(count, max_counts[ng])) for ng, count in hypcnts.items())
+                    clip_count[i] += sum(clipcnt.values())
+
+                # accumulate r & c
+                bestmatch = [1000, 1000]
+                for ref in refs:
+                    if bestmatch[0] == 0:
+                        break
+                    diff = abs(len(ref) - len(hyp))
+                    if diff < bestmatch[0]:
+                        bestmatch[0] = diff
+                        bestmatch[1] = len(ref)
+                r += bestmatch[1]
+                c += len(hyp)
+
+        # computing bleu score
+        p0 = 1e-7
+        bp = \
+            1 if c > r else math.exp(1 - float(r) / float(c))
+        p_ns = \
+            [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)]
+        s = \
+            math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
+        bleu = bp * math.exp(s)
+        return bleu * 100
+
+
+"""
+For the data preparation and evaluation on MultiWOZ2.0/2.1,
+we refer to the code of UBAR (https://github.com/TonyNemo/UBAR-MultiWOZ)
+"""
+
+
+class MultiWOZEvaluator(object):
+
+    def __init__(self, reader, **kwargs):
+        self.reader = reader
+        self.domains = ontology.all_domains
+        self.all_data = self.reader.data
+        self.test_data = self.reader.test
+
+        self.bleu_scorer = BLEUScorer()
+
+        self.all_info_slot = []
+        for d, s_list in ontology.informable_slots.items():
+            for s in s_list:
+                self.all_info_slot.append(d + '-' + s)
+
+        # only evaluate these slots for dialog success
+        self.requestables = ['phone', 'address', 'postcode', 'reference', 'id']
+        self.db_dir = kwargs['data_dir']
+
+    def pack_dial(self, data):
+        dials = {}
+        for turn in data:
+            dial_id = turn['dial_id']
+            if dial_id not in dials:
+                dials[dial_id] = []
+            dials[dial_id].append(turn)
+        return dials
+
+    def validation_metric(self, data, fout=None):
+        bleu = self.bleu_metric(data)
+        # accu_single_dom, accu_multi_dom, multi_dom_num = self.domain_eval(data)
+        success, match, req_offer_counts, dial_num = \
+            self.context_to_response_eval(data, same_eval_as_cambridge=True, fout=fout)
+        return bleu, success, match
+
+    def bleu_metric(self, data, eval_dial_list=None):
+        gen, truth = [], []
+        for row in data:
+            if eval_dial_list and row[
+                    'dial_id'] + '.json' not in eval_dial_list:
+                continue
+            gen.append(row['resp_gen'])
+            truth.append(row['resp'])
+        wrap_generated = [[_] for _ in gen]
+        wrap_truth = [[_] for _ in truth]
+        if gen and truth:
+            try:
+                sc = self.bleu_scorer.score(zip(wrap_generated, wrap_truth))
+            except Exception:
+                sc = 0.0
+        else:
+            sc = 0.0
+        return sc
+
+    def context_to_response_eval(self,
+                                 data,
+                                 eval_dial_list=None,
+                                 same_eval_as_cambridge=False,
+                                 fout=None):
+        dials = self.pack_dial(data)
+        counts = {}
+        for req in self.requestables:
+            counts[req + '_total'] = 0
+            counts[req + '_offer'] = 0
+
+        dial_num, successes, matches = 0, 0, 0
+
+        for dial_id in dials:
+            if eval_dial_list and dial_id + '.json' not in eval_dial_list:
+                continue
+            dial = dials[dial_id]
+            reqs = {}
+            goal = {}
+            if '.json' not in dial_id and '.json' in list(
+                    self.all_data.keys())[0]:
+                dial_id = dial_id + '.json'
+            for domain in ontology.all_domains:
+                if self.all_data[dial_id]['goal'].get(domain):
+                    true_goal = self.all_data[dial_id]['goal']
+                    goal = self._parseGoal(goal, true_goal, domain)
+
+            for domain in goal.keys():
+                reqs[domain] = goal[domain]['requestable']
+
+            success, match, stats, counts = \
+                self._evaluateGeneratedDialogue(dial, goal, reqs, counts,
+                                                same_eval_as_cambridge=same_eval_as_cambridge, fout=fout)
+
+            successes += success
+            matches += match
+            dial_num += 1
+
+        succ_rate = successes / (float(dial_num) + 1e-10) * 100
+        match_rate = matches / (float(dial_num) + 1e-10) * 100
+        return succ_rate, match_rate, counts, dial_num
+
+    def _evaluateGeneratedDialogue(self,
+                                   dialog,
+                                   goal,
+                                   real_requestables,
+                                   counts,
+                                   soft_acc=False,
+                                   same_eval_as_cambridge=False,
+                                   fout=None):
+        """Score one generated dialogue against its user goal.
+            Every turn after the first is scanned for delexicalized placeholders in
+            ``resp_gen``: match (Inform) checks whether a suitable entity was offered,
+            success checks whether the goal's requestable slots were provided.
+            Returns ``(success, match, stats, counts)``; ``counts`` is updated in place."""
+        # slot names checked as [value_<slot>] placeholders in each response
+        requestables = self.requestables
+
+        # CHECK IF MATCH HAPPENED: per-domain accumulators filled while scanning turns
+        provided_requestables = {}
+        venue_offered = {}
+        domains_in_goal = []
+        log = []
+        bspans = {}
+
+        for domain in goal.keys():
+            venue_offered[domain] = []
+            provided_requestables[domain] = []
+            domains_in_goal.append(domain)
+
+        for t, turn in enumerate(dialog):
+            if t == 0:
+                continue  # the first entry of the dialogue is not scored
+            if fout is not None:
+                log.append({
+                    'turn_num': turn['turn_num'],
+                    'turn_domain': turn['dspn'],
+                    'user': turn['user'],
+                    'aspn': turn['aspn'],
+                    'aspn_gen': turn['aspn_gen'],
+                    'resp': turn['resp'],
+                    'resp_gen': turn['resp_gen'],
+                    'pointer': turn['pointer'],
+                })
+
+            sent_t = turn['resp_gen']
+
+            for domain in goal.keys():
+                # for computing success
+                if same_eval_as_cambridge:
+                    # [restaurant_name], [hotel_name] instead of [value_name]
+                    if self.reader.use_true_domain_for_ctr_eval:
+                        dom_pred = [d[1:-1] for d in turn['dspn'].split()]
+                    else:
+                        dom_pred = [d[1:-1] for d in turn['dspn_gen'].split()]
+
+                    if domain not in dom_pred:  # fail
+                        continue
+                if '[value_name]' in sent_t or '[value_id]' in sent_t:
+                    if domain in [
+                            'restaurant', 'hotel', 'attraction', 'train'
+                    ]:
+                        # HERE YOU CAN PUT YOUR BELIEF STATE ESTIMATION
+                        if not self.reader.use_true_curr_bspn and not self.reader.use_true_bspn_for_ctr_eval:
+                            bspn = turn['bspn_gen']
+                        else:
+                            bspn = turn['bspn']
+
+                        constraint_dict = self.reader.bspan_to_constraint_dict(
+                            bspn)
+                        if constraint_dict.get(domain):
+                            venues = self.reader.db.queryJsons(
+                                domain,
+                                constraint_dict[domain],
+                                return_name=True)
+                        else:
+                            venues = []
+
+                        if len(venue_offered[domain]) == 0 and venues:
+
+                            venue_offered[domain] = venues
+                            bspans[domain] = constraint_dict[domain]
+                        else:
+                            flag = False
+                            for ven in venues:
+                                if ven not in venue_offered[domain]:
+                                    flag = True
+                                    break
+                            if flag and venues:  # sometimes there are no results so sample won't work
+                                venue_offered[domain] = venues
+                                bspans[domain] = constraint_dict[domain]
+                    else:  # not limited so we can provide one
+                        venue_offered[domain] = '[value_name]'
+
+                # ATTENTION: assumption here - we didn't provide phone or address twice! etc
+                for requestable in requestables:
+                    if requestable == 'reference':
+                        if '[value_reference]' in sent_t:
+                            if domain in ['restaurant', 'hotel', 'train']:
+                                if 'booked' in turn['pointer'] or 'ok' in turn[
+                                        'pointer'] or '[value_reference]' in turn[
+                                            'resp']:
+                                    # if pointer was allowing for that?
+                                    provided_requestables[domain].append(
+                                        'reference')
+                            else:
+                                provided_requestables[domain].append(
+                                    'reference')
+                    else:
+                        if '[value_' + requestable + ']' in sent_t:
+                            provided_requestables[domain].append(requestable)
+
+        # if name was given in the task
+        for domain in goal.keys():
+            # if name was provided for the user, the match is being done automatically
+            if 'name' in goal[domain]['informable']:
+                venue_offered[domain] = '[value_name]'
+
+            # special domains - entity does not need to be provided
+            if domain in ['taxi', 'police', 'hospital']:
+                venue_offered[domain] = '[value_name]'
+
+            if domain == 'train':
+                if not venue_offered[domain] and 'id' not in goal[domain][
+                        'requestable']:
+                    venue_offered[domain] = '[value_name]'
+        """
+        Given all inform and requestable slots
+        we go through each domain from the user goal
+        and check whether right entity was provided and
+        all requestable slots were given to the user.
+        The dialogue is successful if that's the case for all domains.
+        """
+        # HARD EVAL: stats[domain] = [match, success, domain is in goal]
+        stats = {
+            'restaurant': [0, 0, 0],
+            'hotel': [0, 0, 0],
+            'attraction': [0, 0, 0],
+            'train': [0, 0, 0],
+            'taxi': [0, 0, 0],
+            'hospital': [0, 0, 0],
+            'police': [0, 0, 0]
+        }
+
+        match = 0
+        success = 0
+        # MATCH: a domain matches when an offered venue is among the goal's venues
+        for domain in goal.keys():
+            match_stat = 0
+            if domain in ['restaurant', 'hotel', 'attraction', 'train']:
+                goal_venues = self.reader.db.queryJsons(
+                    domain, goal[domain]['informable'], return_name=True)
+                if type(venue_offered[domain]
+                        ) is str and '_name' in venue_offered[domain]:
+                    match += 1
+                    match_stat = 1
+                elif len(venue_offered[domain]) > 0 and len(
+                        set(venue_offered[domain]) & set(goal_venues)) > 0:
+                    match += 1
+                    match_stat = 1
+            else:
+                if '_name]' in venue_offered[domain]:
+                    match += 1
+                    match_stat = 1
+
+            stats[domain][0] = match_stat
+            stats[domain][2] = 1
+
+        if soft_acc:  # fraction of matched domains instead of all-or-nothing
+            match = float(match) / len(goal.keys())
+        else:
+            if match == len(goal.keys()):
+                match = 1.0
+            else:
+                match = 0.0
+
+        for domain in domains_in_goal:
+            for request in real_requestables[domain]:
+                counts[request + '_total'] += 1
+                if request in provided_requestables[domain]:
+                    counts[request + '_offer'] += 1
+
+        # SUCCESS: with fout set it is computed even when match failed
+        if fout is not None:
+            for domain in domains_in_goal:
+                success_stat = 0
+                domain_success = 0
+                if len(real_requestables[domain]) == 0:
+                    success += 1
+                    success_stat = 1
+                    stats[domain][1] = success_stat
+                    continue
+                # if values in sentences are super set of requestables
+                for request in real_requestables[domain]:
+                    if request in provided_requestables[domain]:
+                        domain_success += 1
+
+                if domain_success == len(real_requestables[domain]):
+                    success += 1
+                    success_stat = 1
+
+                stats[domain][1] = success_stat
+
+            # final eval
+            if soft_acc:
+                success = float(success) / len(real_requestables)
+            else:
+                if success >= len(real_requestables):
+                    success = 1
+                else:
+                    success = 0
+        else:
+            if match == 1.0:
+                for domain in domains_in_goal:
+                    success_stat = 0
+                    domain_success = 0
+                    if len(real_requestables[domain]) == 0:
+                        success += 1
+                        success_stat = 1
+                        stats[domain][1] = success_stat
+                        continue
+                    # if values in sentences are super set of requestables
+                    for request in real_requestables[domain]:
+                        if request in provided_requestables[domain]:
+                            domain_success += 1
+
+                    if domain_success == len(real_requestables[domain]):
+                        success += 1
+                        success_stat = 1
+
+                    stats[domain][1] = success_stat
+
+                # final eval
+                if soft_acc:
+                    success = float(success) / len(real_requestables)
+                else:
+                    if success >= len(real_requestables):
+                        success = 1
+                    else:
+                        success = 0
+
+        if fout is not None and success == 0:  # dump unsuccessful dialogues as one JSON line
+            sample = {
+                dialog[0]['dial_id']: {
+                    'log': log,
+                    'real_requestables': real_requestables,
+                    'provided_requestables': provided_requestables
+                }
+            }
+            line = json.dumps(sample)
+            fout.write(line)
+            fout.write('\n')
+
+        return success, match, stats, counts
+
+    def _parseGoal(self, goal, true_goal, domain):
+        """Fill ``goal[domain]`` with informable/requestable/booking data; return ``goal``."""
+        parsed = {'informable': {}, 'requestable': [], 'booking': []}
+        goal[domain] = parsed
+        domain_goal = true_goal[domain]
+        # without an 'info' section the entry keeps its empty defaults
+        if 'info' not in domain_goal:
+            return goal
+
+        has_booking = 'book' in domain_goal
+        wanted = parsed['requestable']
+        if domain == 'train':
+            # we consider dialogues only where train had to be booked!
+            if has_booking:
+                wanted.append('reference')
+            if 'reqt' in domain_goal and 'id' in domain_goal['reqt']:
+                wanted.append('id')
+        else:
+            # additional requests: only the ones that can be easily delexicalized
+            delex_slots = ['phone', 'address', 'postcode', 'reference', 'id']
+            if 'reqt' in domain_goal:
+                wanted.extend(s for s in domain_goal['reqt'] if s in delex_slots)
+            if has_booking:
+                wanted.append('reference')
+
+        for slot, value in domain_goal['info'].items():
+            slot, value = clean_slot_values(self.db_dir, domain, slot, value)
+            if len(value.split()) > 1:
+                words = [token.text for token in self.reader.nlp(value)]
+                value = ' '.join(words).strip()
+            parsed['informable'][slot] = value
+        if has_booking:
+            parsed['booking'] = domain_goal['book']
+        return goal
+
+
+class GenericEvaluator:
+    """Metric implementations shared by the dataset-specific evaluators.
+
+    Subclasses supply ``clean`` and the ``informable_slots``,
+    ``requestable_slots`` and ``entities_flat`` attributes read below.
+    """
+
+    def __init__(self, reader):
+        self.reader = reader
+        self.metric_dict = {}
+
+    def pack_dial(self, data):
+        """Group a flat list of turns into ``{dial_id: [turn, ...]}``."""
+        dials = {}
+        for turn in data:
+            dials.setdefault(turn['dial_id'], []).append(turn)
+        return dials
+
+    def run_metrics(self, results):
+        """Always raises ValueError; concrete evaluators override this."""
+        raise ValueError('Please specify the evaluator first')
+
+    def bleu_metric(self, data, type='bleu'):
+        """Score cleaned ``resp_gen`` against cleaned ``resp`` with BLEUScorer.
+
+        ``type`` is accepted but not used by this implementation.
+        """
+        gen, truth = [], []
+        for row in data:
+            gen.append(self.clean(row['resp_gen']))
+            truth.append(self.clean(row['resp']))
+        wrap_generated = [[_] for _ in gen]
+        wrap_truth = [[_] for _ in truth]
+        return BLEUScorer().score(zip(wrap_generated, wrap_truth))
+
+    def _normalize_constraint(self,
+                              constraint,
+                              ignore_dontcare=False,
+                              intersection=True):
+        """
+        Normalize a belief-state constraint dict onto the informable slots.
+        :param constraint - {'food': 'asian oriental', 'pricerange': 'cheap'}
+        :param ignore_dontcare: if true, slots valued 'dontcare' are left empty
+        :param intersection: if true, only keeps the values that appear in the ontology
+                                        we set intersection=True as in previous works
+        :returns: normalized constraint dict
+                      e.g. - {'food': 'asian oriental', 'pricerange': 'cheap', 'area': ''}
+        """
+        normalized = {}
+        for s in self.informable_slots:
+            normalized[s] = ''
+        for s, v in constraint.items():
+            if ignore_dontcare and v == 'dontcare':
+                continue
+            if intersection and v != 'dontcare' and v not in self.entities_flat:
+                continue
+
+            normalized[s] = v
+
+        return normalized
+
+    def _normalize_act(self, aspn, intersection=False):
+        """Split a '|'-separated act span into ``{act_order[i]: set_of_words}``.
+
+        With ``intersection`` only '[value' tokens are kept for the 'av' part
+        and only requestable slot names for every other part.
+        """
+        aspn_list = aspn.split('|')
+        normalized = {}
+        for i, v in enumerate(aspn_list):
+            seq = v.strip()
+            word_set = set()
+            for w in seq.split():
+                if intersection:
+                    if self.reader.act_order[i] == 'av':
+                        if '[value' in w:
+                            word_set.add(w)
+                    else:
+                        if w in self.requestable_slots:
+                            word_set.add(w)
+                else:
+                    word_set.add(w)
+            normalized[self.reader.act_order[i]] = word_set
+        return normalized
+
+    def tracker_metric(self, data, normalize=True):
+        """Turn-level belief tracking metrics.
+
+        Returns ``(precision, recall, f1, goal_accr, slot_accr, db_correct)``.
+        Turns whose user text contains 'thank' or 'bye' are not counted.
+        """
+        tp, fp, fn, db_correct = 0, 0, 0, 0
+        goal_accr, slot_accr, total = 0, {}, 1e-8
+        for s in self.informable_slots:
+            slot_accr[s] = 0
+
+        for row in data:
+            if normalize:
+                gen = self._normalize_constraint(row['bspn_gen'])
+                truth = self._normalize_constraint(row['bspn'])
+            else:
+                gen = self._normalize_constraint(
+                    row['bspn_gen'], intersection=False)
+                truth = self._normalize_constraint(
+                    row['bspn'], intersection=False)
+            valid = 'thank' not in row['user'] and 'bye' not in row['user']
+            if valid:
+                for slot, value in gen.items():
+                    if value in truth[slot]:
+                        tp += 1
+                    else:
+                        fp += 1
+                for slot, value in truth.items():
+                    if value not in gen[slot]:
+                        fn += 1
+
+            if truth and valid:
+                total += 1
+                for s in self.informable_slots:
+                    if gen[s] == truth[s]:
+                        slot_accr[s] += 1
+                if gen == truth:
+                    goal_accr += 1
+                if row.get('db_gen') and row.get('db_match'):
+                    if row['db_gen'] == row['db_match']:
+                        db_correct += 1
+        precision, recall = tp / (tp + fp + 1e-8), tp / (tp + fn + 1e-8)
+        f1 = 2 * precision * recall / (precision + recall + 1e-8)
+        goal_accr /= total
+        db_correct /= total
+        for s in slot_accr:
+            slot_accr[s] /= total
+        return precision, recall, f1, goal_accr, slot_accr, db_correct
+
+    def request_metric(self, data):
+        """Dialog-level request F1; returns ``(f1, precision, recall)``.
+
+        A slot counts as requested when a cleaned response has a token that
+        contains '[value_', ends with ']' and is not '[value_name]'.
+        """
+        dials = self.pack_dial(data)
+        tp, fp, fn = 0, 0, 0
+        for dial_id in dials:
+            truth_req, gen_req = set(), set()
+            dial = dials[dial_id]
+            for turn in dial:
+                resp_gen_token = self.clean(turn['resp_gen']).split()
+                resp_token = self.clean(turn['resp']).split()
+                for w in resp_gen_token:
+                    if '[value_' in w and w.endswith(
+                            ']') and w != '[value_name]':
+                        gen_req.add(w[1:-1].split('_')[1])
+                for w in resp_token:
+                    if '[value_' in w and w.endswith(
+                            ']') and w != '[value_name]':
+                        truth_req.add(w[1:-1].split('_')[1])
+            tp += len(gen_req & truth_req)
+            fp += len(gen_req - truth_req)
+            fn += len(truth_req - gen_req)
+        precision, recall = tp / (tp + fp + 1e-8), tp / (tp + fn + 1e-8)
+        f1 = 2 * precision * recall / (precision + recall + 1e-8)
+        return f1, precision, recall
+
+    def act_metric(self, data):
+        """Turn-level F1 of generated system acts against the reference acts.
+
+        Returns ``{key: [f1, precision, recall]}`` for 'all_v', 'all_s' and for
+        every requestable slot ``s`` and its ``[value_s]`` token. Turns whose
+        user text contains 'thank' or 'bye' are skipped.
+
+        Assumes ``reader.act_order`` names an 'av' part; an 'as' part is optional.
+        """
+        tp = {'all_s': 0, 'all_v': 0}
+        fp = {'all_s': 0, 'all_v': 0}
+        fn = {'all_s': 0, 'all_v': 0}
+        for s in self.requestable_slots:
+            for key in (s, '[value_%s]' % s):
+                tp[key], fp[key], fn[key] = 0, 0, 0
+
+        def tally(gen_items, truth_items, total_key):
+            # Per-key counters start at 0, so test membership (``key in tp``);
+            # a truthiness test such as ``tp.get(key)`` would never count them.
+            for item in gen_items:
+                hit = tp if item in truth_items else fp
+                hit[total_key] += 1
+                if item in hit:
+                    hit[item] += 1
+            for item in truth_items:
+                if item not in gen_items:
+                    fn[total_key] += 1
+                    if item in fn:
+                        fn[item] += 1
+
+        for row in data:
+            gen = self._normalize_act(row['aspn_gen'])
+            truth = self._normalize_act(row['aspn'])
+            if 'thank' in row['user'] or 'bye' in row['user']:
+                continue
+            # how well the act decoder captures user's requests
+            tally(gen['av'], truth['av'], 'all_v')
+            # how accurately the act decoder predicts system's question
+            if 'as' in gen:
+                tally(gen['as'], truth['as'], 'all_s')
+
+        result = {}
+        for k in tp:
+            precision = tp[k] / (tp[k] + fp[k] + 1e-8)
+            recall = tp[k] / (tp[k] + fn[k] + 1e-8)
+            f1 = 2 * precision * recall / (precision + recall + 1e-8)
+            result[k] = [f1, precision, recall]
+        return result
+
+
+"""
+For the data preparation and evaluation on In-Car Assistant/CamRest,
+we refer to the code of LABES (https://github.com/thu-spmi/LABES)
+"""
+
+
+class CamRestEvaluator(GenericEvaluator):
+    """Evaluator whose entity list is read from the reader's ontology JSON file."""
+
+    def __init__(self, reader):
+        super().__init__(reader)
+        self.entities_flat, self.entitiy_to_slot_dict = self.get_entities(
+            self.reader.ontology_path)
+        self.informable_slots = self.reader.otlg.informable_slots
+        self.requestable_slots = self.reader.otlg.requestable_slots
+
+    def run_metrics(self, results):
+        """Compute every metric over ``results`` and return them in a dict."""
+        bleu = self.bleu_metric(results)
+        p, r, f1, goal_acc, slot_acc, db_acc = self.tracker_metric(results)
+        match = self.match_metric(results)
+        req_f1, req_p, req_r = self.request_metric(results)
+        return {
+            'bleu': bleu,
+            'match': match,
+            'req_f1': req_f1,
+            'joint_goal': goal_acc,
+            'slot_accu': slot_acc,
+            'slot-p/r/f1': (p, r, f1),
+            'db_acc': db_acc,
+        }
+
+    def get_entities(self, entity_path):
+        """Return (flat informable values, value -> slot dict) from the lower-cased ontology JSON."""
+        with open(entity_path) as f:  # closed even if reading or parsing raises
+            raw_entities = json.loads(f.read().lower())
+        entities_flat = []
+        entitiy_to_slot_dict = {}
+        for s, values in raw_entities['informable'].items():
+            entities_flat.extend(values)
+            for v in values:
+                entitiy_to_slot_dict[v] = s
+        return entities_flat, entitiy_to_slot_dict
+
+    def constraint_same(self, truth_cons, gen_cons):
+        """True if both are empty, False if only one is, else ``setsim``'s verdict."""
+        if not truth_cons and not gen_cons:
+            return True
+        if not truth_cons or not gen_cons:
+            return False
+        return setsim(gen_cons, truth_cons)
+
+    def match_metric(self, data):
+        """Share of dialogues (with a non-empty reference) whose constraints match."""
+        match, total = 0, 1e-8
+        for dial in self.pack_dial(data).values():
+            truth_cons, gen_cons = {'1': '', '2': '', '3': ''}, None
+            for turn in dial:
+                # find the last turn which the system provide an entity
+                if '[value' in turn['resp_gen']:
+                    gen_cons = self._normalize_constraint(
+                        turn['bspn_gen'], ignore_dontcare=True)
+                if '[value' in turn['resp']:
+                    truth_cons = self._normalize_constraint(
+                        turn['bspn'], ignore_dontcare=True)
+            if not gen_cons:
+                # if no entity is provided, choose the state of the last dialog turn
+                gen_cons = self._normalize_constraint(
+                    dial[-1]['bspn_gen'], ignore_dontcare=True)
+            if list(truth_cons.values()) != ['', '', '']:
+                if gen_cons == truth_cons:
+                    match += 1
+                total += 1
+        return match / total
+
+    def clean(self, resp):
+        """Re-wrap ``resp`` in one sos/eos token pair and delexicalize entity values."""
+        # same clean process as in Sequicity, SEDST, FSDM, to ensure comparable results
+        resp = resp.replace(f'{self.reader.sos_r_token} ', '')
+        resp = resp.replace(f' {self.reader.eos_r_token}', '')
+        resp = f'{self.reader.sos_r_token} {resp} {self.reader.eos_r_token}'
+        for value, slot in self.entitiy_to_slot_dict.items():
+            resp = utils.clean_replace(resp, value, '[value_%s]' % slot)
+        return resp
+
+
+class KvretEvaluator(GenericEvaluator):
+    """Evaluator whose entity list comes from ``reader.entity_dict``.
+
+    Constraint values are stripped of filler words before comparison.
+    """
+
+    def __init__(self, reader):
+        super().__init__(reader)
+        self.entities_flat, self.entitiy_to_slot_dict = self.get_entities(
+            self.reader.ontology_path)
+        self.informable_slots = self.reader.otlg.informable_slots
+        self.requestable_slots = self.reader.otlg.requestable_slots
+
+    def run_metrics(self, results):
+        """Compute every metric over ``results`` and return them in a dict."""
+        bleu = self.bleu_metric(results)
+        p, r, f1, goal_acc, slot_acc, db_acc = self.tracker_metric(
+            results, normalize=True)
+        match = self.match_metric(results)
+        req_f1, req_p, req_r = self.request_metric(results)
+        return {
+            'bleu': bleu,
+            'match': match,
+            'req_f1': req_f1,
+            'joint_goal': goal_acc,
+            'slot_accu': slot_acc,
+            'slot-p/r/f1': (p, r, f1),
+            'db_acc': db_acc,
+        }
+
+    def _normalize_constraint(self,
+                              constraint,
+                              ignore_dontcare=False,
+                              intersection=True):
+        """
+        Normalize a belief-state constraint dict onto the informable slots.
+        Filler words listed in ``junk`` are removed from each value first.
+        :param constraint - {'food': 'asian oriental', 'pricerange': 'cheap'}
+        :param ignore_dontcare: accepted for interface parity; not used here
+        :param intersection: if true, only keeps the values that appear in the ontology
+                                        we set intersection=True as in previous works
+        :returns: normalized constraint dict
+                      e.g. - {'food': 'asian oriental', 'pricerange': 'cheap', 'area': ''}
+        """
+        # filler words removed from every value before the ontology lookup
+        junk = [
+            'good', 'great', 'quickest', 'shortest', 'route', 'week',
+            'fastest', 'nearest', 'next', 'closest', 'way', 'mile', 'activity',
+            'restaurant', 'appointment'
+        ]
+        normalized = {}
+        for s in self.informable_slots:
+            normalized[s] = ''
+        for s, v in constraint.items():
+            for j in junk:
+                v = ' '.join(v.replace(j, '').split())
+            if intersection and v not in self.entities_flat:
+                continue
+
+            if s in self.informable_slots:
+                normalized[s] = v
+            else:
+                # TODO only use slot (not domain) in s for matching !!!
+                pass
+
+        return normalized
+
+    def get_entities(self, entity_path):
+        """Return ``(entities_flat, entitiy_to_slot_dict)`` taken from the reader.
+
+        ``entity_path`` is unused: the mapping is ``self.reader.entity_dict``
+        itself and the list holds its keys in iteration order.
+        """
+        entitiy_to_slot_dict = self.reader.entity_dict
+        # Mapping keys are already unique, so no per-key membership scan is needed.
+        return list(entitiy_to_slot_dict), entitiy_to_slot_dict
+
+    def constraint_same(self, truth_cons, gen_cons):
+        """True if both are empty, False if only one is, else ``setsim``'s verdict."""
+        if not truth_cons and not gen_cons:
+            return True
+        if not truth_cons or not gen_cons:
+            return False
+        return setsim(gen_cons, truth_cons)
+
+    def match_metric(self, data):
+        """Share of dialogues (with a non-empty reference) whose constraints match."""
+        match, total = 0, 1e-8
+        for dial in self.pack_dial(data).values():
+            # placeholder reference state: eleven empty values keyed '1'..'11'
+            truth_cons, gen_cons = {str(i): '' for i in range(1, 12)}, None
+            for turn in dial:
+                # find the last turn which the system provide an entity
+                if '[value' in turn['resp_gen']:
+                    gen_cons = self._normalize_constraint(
+                        turn['bspn_gen'], ignore_dontcare=True)
+                if '[value' in turn['resp']:
+                    truth_cons = self._normalize_constraint(
+                        turn['bspn'], ignore_dontcare=True)
+
+            if not gen_cons:
+                # if no entity is provided, choose the state of the last dialog turn
+                gen_cons = self._normalize_constraint(
+                    dial[-1]['bspn_gen'], ignore_dontcare=True)
+
+            # NOTE(review): the literal 11 presumably equals the number of
+            # informable slots -- confirm against the reader's ontology.
+            if list(truth_cons.values()) != [''] * 11:
+                gen_cons = [x for x in gen_cons.values() if x]
+                truth_cons = [x for x in truth_cons.values() if x]
+                # NOTE(review): arguments are passed as (gen, truth) although the
+                # signature is (truth, gen); harmless only if setsim is symmetric.
+                if self.constraint_same(gen_cons, truth_cons):
+                    match += 1
+                total += 1
+
+        return match / total
+
+    def clean(self, resp):
+        """Re-wrap ``resp`` in one sos/eos token pair and delexicalize entity values."""
+        # we  use the same clean process as in Sequicity, SEDST, FSDM
+        # to ensure comparable results
+        resp = resp.replace(f'{self.reader.sos_r_token} ', '')
+        resp = resp.replace(f' {self.reader.eos_r_token}', '')
+        resp = f'{self.reader.sos_r_token} {resp} {self.reader.eos_r_token}'
+        for value, slot in self.entitiy_to_slot_dict.items():
+            resp = utils.clean_replace(resp, value, '[value_%s]' % slot)
+        return resp
diff --git a/modelscope/trainers/nlp/space/trainer/gen_trainer.py b/modelscope/trainers/nlp/space/trainer/gen_trainer.py
index aa28d798..34cd2f9b 100644
--- a/modelscope/trainers/nlp/space/trainer/gen_trainer.py
+++ b/modelscope/trainers/nlp/space/trainer/gen_trainer.py
@@ -15,27 +15,11 @@ from transformers.optimization import AdamW, get_linear_schedule_with_warmup
 
 from modelscope.trainers.nlp.space.metrics.metrics_tracker import \
     MetricsTracker
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
 from modelscope.utils.nlp.space import ontology
 
 
-def get_logger(log_path, name='default'):
-    logger = logging.getLogger(name)
-    logger.propagate = False
-    logger.setLevel(logging.DEBUG)
-
-    formatter = logging.Formatter('%(message)s')
-
-    sh = logging.StreamHandler(sys.stdout)
-    sh.setFormatter(formatter)
-    logger.addHandler(sh)
-
-    fh = logging.FileHandler(log_path, mode='w')
-    fh.setFormatter(formatter)
-    logger.addHandler(fh)
-
-    return logger
-
-
 class Trainer(object):
 
     def __init__(self,
@@ -51,15 +35,16 @@ class Trainer(object):
 
         self.do_train = config.do_train
         self.do_infer = config.do_infer
-        self.is_decreased_valid_metric = config.Trainer.valid_metric_name[
-            0] == '-'
-        self.valid_metric_name = config.Trainer.valid_metric_name[1:]
-        self.num_epochs = config.Trainer.num_epochs
-        # self.save_dir = config.Trainer.save_dir
-        self.log_steps = config.Trainer.log_steps
-        self.valid_steps = config.Trainer.valid_steps
-        self.save_checkpoint = config.Trainer.save_checkpoint
-        self.save_summary = config.Trainer.save_summary
+        if self.do_train:
+            self.is_decreased_valid_metric = config.Trainer.valid_metric_name[
+                0] == '-'
+            self.valid_metric_name = config.Trainer.valid_metric_name[1:]
+            self.num_epochs = config.Trainer.num_epochs
+            self.save_dir = config.Trainer.save_dir
+            self.log_steps = config.Trainer.log_steps
+            self.valid_steps = config.Trainer.valid_steps
+            self.save_checkpoint = config.Trainer.save_checkpoint
+            self.save_summary = config.Trainer.save_summary
         self.lr = config.Model.lr
         self.weight_decay = config.Model.weight_decay
         self.batch_size = config.Trainer.batch_size
@@ -71,22 +56,21 @@ class Trainer(object):
         self.optimizer = optimizer
 
         self.model = model
-        self.func_model = self.model.module if self.gpu > 1 else self.model
+        self.func_model = self.model.module if self.gpu > 1 and config.use_gpu else self.model
         self.reader = reader
         self.evaluator = evaluator
         self.tokenizer = reader.tokenizer
 
-        # if not os.path.exists(self.save_dir):
-        #     os.makedirs(self.save_dir)
-
-        # self.logger = logger or get_logger(os.path.join(self.save_dir, "trainer.log"), "trainer")
-        self.logger = logger or get_logger('trainer.log', 'trainer')
+        self.logger = get_logger()
 
         self.batch_metrics_tracker = MetricsTracker()
         self.token_metrics_tracker = MetricsTracker()
 
-        self.best_valid_metric = float(
-            'inf' if self.is_decreased_valid_metric else '-inf')
+        if self.do_train:
+            if not os.path.exists(self.save_dir):
+                os.makedirs(self.save_dir)
+            self.best_valid_metric = float(
+                'inf' if self.is_decreased_valid_metric else '-inf')
         self.epoch = 0
 
     def decode_generated_bspn_resp(self, generated):
@@ -248,9 +232,12 @@ class Trainer(object):
 
         # Save current best model
         if is_best:
-            best_model_file = os.path.join(self.save_dir, 'best.model')
+            best_model_file = os.path.join(self.save_dir,
+                                           ModelFile.TORCH_MODEL_BIN_FILE)
             torch.save(self.model.state_dict(), best_model_file)
-            best_train_file = os.path.join(self.save_dir, 'best.train')
+            best_train_file = os.path.join(
+                self.save_dir,
+                '{}.train'.format(ModelFile.TORCH_MODEL_BIN_FILE))
             torch.save(train_state, best_train_file)
             self.logger.info(
                 f"Saved best model state to '{best_model_file}' with new best valid metric "
@@ -324,8 +311,7 @@ class Trainer(object):
 
             self.func_model.load_state_dict(model_state_dict)
             self.logger.info(
-                f"Loaded model state from '{self.func_model.init_checkpoint}.model'"
-            )
+                f"Loaded model state from '{self.func_model.init_checkpoint}'")
 
         def _load_train_state():
             train_file = f'{self.func_model.init_checkpoint}.train'
@@ -558,19 +544,17 @@ class MultiWOZTrainer(Trainer):
                         generated_bs = outputs[0].cpu().numpy().tolist()
                         bspn_gen = self.decode_generated_bspn(generated_bs)
                         # check DB result
-                        if self.reader.use_true_db_pointer:  # To control whether current db is ground truth
+                        if self.reader.use_true_db_pointer:
                             db = turn['db']
                         else:
                             db_result = self.reader.bspan_to_DBpointer(
                                 self.tokenizer.decode(bspn_gen),
                                 turn['turn_domain'])
-                            assert len(turn['db']) == 4
-                            book_result = turn['db'][2]
+                            assert len(turn['db']) == 3
                             assert isinstance(db_result, str)
                             db = \
                                 [self.reader.sos_db_id] + \
                                 self.tokenizer.convert_tokens_to_ids([db_result]) + \
-                                [book_result] + \
                                 [self.reader.eos_db_id]
                             prompt_id = self.reader.sos_a_id
 
@@ -636,7 +620,7 @@ class MultiWOZTrainer(Trainer):
         score = 0.5 * (success + match) + bleu
 
         # log results
-        metrics_message = 'match: %2.2f  success: %2.2f  bleu: %2.2f  score: %.2f' %\
+        metrics_message = 'match: %2.2f  success: %2.2f  bleu: %2.2f  score: %.2f' % \
                           (match, success, bleu, score)
         message_prefix = f'[Infer][{self.epoch}]'
         time_cost = f'TIME-{time.time() - begin_time:.3f}'
diff --git a/modelscope/utils/nlp/space/clean_dataset.py b/modelscope/utils/nlp/space/clean_dataset.py
new file mode 100644
index 00000000..4578ccc4
--- /dev/null
+++ b/modelscope/utils/nlp/space/clean_dataset.py
@@ -0,0 +1,333 @@
+import os
+import re
+
+from . import ontology
+
+
+def clean_text_split_dot(text):  # detach '.' from neighbouring words
+    text = re.sub(r'([a-zT]+)\.([a-z])', r'\1 . \2',
+                  text)  # 'abc.xyz' -> 'abc . xyz'
+    text = re.sub(r'(\w+)\.\.? ', r'\1 . ', text)  # 'abc. ' -> 'abc . '
+    return text
+
+
+def clean_text(data_dir, text):
+    """Lowercase and normalise one utterance or slot value.
+
+    Space-delimited rewrites are read from ``mapping.pair`` in ``data_dir``.
+    """
+    text = text.strip().lower()
+    text = text.replace(u'’', "'")
+    text = text.replace(u'‘', "'")
+    text = text.replace(';', ',')
+    text = text.replace('"', ' ')
+    text = text.replace('/', ' and ')
+    text = text.replace("don't", "do n't")
+    text = clean_time(text)
+    baddata = {
+        r'c\.b (\d), (\d) ([a-z])\.([a-z])': r'cb\1\2\3\4',
+        'c.b. 1 7 d.y': 'cb17dy',
+        'c.b.1 7 d.y': 'cb17dy',
+        'c.b 25, 9 a.q': 'cb259aq',
+        'isc.b 25, 9 a.q': 'is cb259aq',
+        'c.b2, 1 u.f': 'cb21uf',
+        'c.b 1,2 q.a': 'cb12qa',
+        '0-122-336-5664': '01223365664',
+        'postcodecb21rs': 'postcode cb21rs',
+        r'i\.d': 'id',
+        ' i d ': 'id',
+        'Telephone:01223358966': 'Telephone: 01223358966',
+        'depature': 'departure',
+        'depearting': 'departing',
+        '-type': ' type',
+        r'b[\s]?&[\s]?b': 'bed and breakfast',
+        'b and b': 'bed and breakfast',
+        r'guesthouse[s]?': 'guest house',
+        r'swimmingpool[s]?': 'swimming pool',
+        "wo n\'t": 'will not',
+        " \'d ": ' would ',
+        " \'m ": ' am ',
+        " \'re ": ' are ',
+        " \'ll ": ' will ',
+        " \'ve ": ' have ',
+        r'^\'': '',
+        r'\'$': '',
+    }
+    for tmpl, good in baddata.items():  # keys are regex patterns
+        text = re.sub(tmpl, good, text)
+
+    text = clean_text_split_dot(text)
+
+    with open(os.path.join(data_dir, 'mapping.pair'), 'r') as fin:
+        for line in fin:  # each line: "<from>\t<to>"
+            fromx, tox = line.replace('\n', '').split('\t')
+            text = f' {text} '.replace(f' {fromx} ', f' {tox} ')[1:-1]
+
+    return text
+
+
+def clean_time(utter):  # rewrite am/pm clock times; order of steps matters
+    utter = re.sub(r'(\d+) ([ap]\.?m)', lambda x: x.group(1) + x.group(2),
+                   utter)  # 9 am -> 9am
+    utter = re.sub(r'((?<!\d)\d:\d+)(am)?', r'0\1', utter)  # 9:30am -> 09:30
+    utter = re.sub(r'((?<!\d)\d)am', r'0\1:00', utter)  # 9am -> 09:00
+    utter = re.sub(r'((?<!\d)\d)pm',  # 9pm -> 21:00
+                   lambda x: str(int(x.group(1)) + 12) + ':00', utter)
+    utter = re.sub(r'(\d+)(:\d+)pm',  # 10:30pm -> 22:30
+                   lambda x: str(int(x.group(1)) + 12) + x.group(2), utter)
+    utter = re.sub(r'(\d+)a\.?m', r'\1', utter)  # 10am -> 10
+    return utter  # NOTE(review): '12:30pm' becomes '24:30' above - intended?
+
+
+def clean_slot_values(data_dir, domain, slot, value):
+    """Normalise a slot name and its value for the given domain.
+
+    The value is first passed through ``clean_text``; known misspellings are
+    then fixed and known-bad values blanked. Returns ``(slot, value)``.
+    """
+    value = clean_text(data_dir, value)
+    if not value:
+        value = ''
+    elif value == 'not mentioned':
+        value = ''
+        # value = 'not mentioned' # if in DST setting
+    elif domain == 'attraction':
+        if slot == 'name':
+            if value == 't':
+                value = ''
+            if value == 'trinity':
+                value = 'trinity college'
+        elif slot == 'area':
+            if value in ['town centre', 'cent', 'center', 'ce']:
+                value = 'centre'
+            elif value in [
+                    'ely', 'in town', 'museum', 'norwich', 'same area as hotel'
+            ]:
+                value = ''
+            elif value in ['we']:
+                value = 'west'
+        elif slot == 'type':
+            if value in ['m', 'mus', 'musuem']:
+                value = 'museum'
+            elif value in ['art', 'architectural']:
+                value = 'architecture'
+            elif value in ['churches']:
+                value = 'church'
+            elif value in ['coll']:
+                value = 'college'
+            elif value in ['concert', 'concerthall']:
+                value = 'concert hall'
+            elif value in ['night club']:
+                value = 'nightclub'
+            elif value in [
+                    'mutiple sports', 'mutliple sports', 'sports', 'galleria'
+            ]:
+                value = 'multiple sports'
+            elif value in ['ol', 'science', 'gastropub', 'la raza']:
+                value = ''
+            elif value in ['swimmingpool', 'pool']:
+                value = 'swimming pool'
+            elif value in ['fun']:
+                value = 'entertainment'
+
+    elif domain == 'hotel':
+        if slot == 'area':
+            if value in [
+                    'cen', 'centre of town', 'near city center', 'center'
+            ]:
+                value = 'centre'
+            elif value in ['east area', 'east side']:
+                value = 'east'
+            elif value in ['in the north', 'north part of town']:
+                value = 'north'
+            elif value in ['we']:
+                value = 'west'
+        elif slot == 'day':
+            if value == 'monda':
+                value = 'monday'
+            elif value == 't':
+                value = 'tuesday'
+        elif slot == 'name':
+            if value == 'uni':
+                value = 'university arms hotel'
+            elif value == 'university arms':
+                value = 'university arms hotel'
+            elif value == 'acron':
+                value = 'acorn guest house'
+            elif value == 'ashley':
+                value = 'ashley hotel'
+            elif value == 'arbury lodge guesthouse':
+                value = 'arbury lodge guest house'
+            elif value == 'la':
+                value = 'la margherit'
+            elif value == 'no':
+                value = ''
+        elif slot == 'internet':
+            if value == 'does not':
+                value = 'no'
+            elif value in ['y', 'free', 'free internet']:
+                value = 'yes'
+            elif value in ['4']:
+                value = ''
+        elif slot == 'parking':
+            if value == 'n':
+                value = 'no'
+            elif value in ['free parking']:
+                value = 'yes'
+            elif value in ['y']:
+                value = 'yes'
+        elif slot in ['pricerange', 'price range']:
+            slot = 'pricerange'
+            if value == 'moderately':
+                value = 'moderate'
+            elif value in ['any']:
+                value = "do n't care"
+            elif value in ['inexpensive']:
+                value = 'cheap'
+            elif value in ['2', '4']:
+                value = ''
+        elif slot == 'stars':
+            if value == 'two':
+                value = '2'
+            elif value == 'three':
+                value = '3'
+            elif value in [
+                    '4-star', '4 stars', '4 star', 'four star', 'four stars'
+            ]:
+                value = '4'
+        elif slot == 'type':
+            if value == '0 star rarting':
+                value = ''
+            elif value == 'guesthouse':
+                value = 'guest house'
+            elif value not in ['hotel', 'guest house', "do n't care"]:
+                value = ''
+    elif domain == 'restaurant':
+        if slot == 'area':
+            if value in [
+                    'center', 'scentre', 'center of town', 'city center',
+                    'cb30aq', 'town center', 'centre of cambridge',
+                    'city centre'
+            ]:
+                value = 'centre'
+            elif value == 'west part of town':
+                value = 'west'
+            elif value == 'n':
+                value = 'north'
+            elif value in ['the south']:
+                value = 'south'
+            elif value not in [
+                    'centre', 'south', "do n't care", 'west', 'east', 'north'
+            ]:
+                value = ''
+        elif slot == 'day':
+            if value == 'monda':
+                value = 'monday'
+            elif value == 't':
+                value = 'tuesday'
+        elif slot in ['pricerange', 'price range']:
+            slot = 'pricerange'
+            if value in ['moderately', 'mode', 'mo']:
+                value = 'moderate'
+            elif value in ['not']:
+                value = ''
+            elif value in ['inexpensive', 'ch']:
+                value = 'cheap'
+        elif slot == 'food':
+            if value == 'barbecue':
+                value = 'barbeque'
+        elif slot == 'time':
+            if value == '9:00':
+                value = '09:00'
+            elif value == '9:45':
+                value = '09:45'
+            elif value == '1330':
+                value = '13:30'
+            elif value == '1430':
+                value = '14:30'
+            elif value == '9:15':
+                value = '09:15'
+            elif value == '9:30':
+                value = '09:30'
+            elif value == '1830':
+                value = '18:30'
+            elif value == '9':
+                value = '09:00'
+            elif value == '2:00':
+                value = '14:00'
+            elif value == '1:00':
+                value = '13:00'
+            elif value == '3:00':
+                value = '15:00'
+    elif domain == 'taxi':
+        if slot in ['arriveBy', 'arrive by']:
+            slot = 'arriveby'
+            if value == '1530':
+                value = '15:30'
+            elif value == '15 minutes':
+                value = ''
+        elif slot in ['leaveAt', 'leave at']:
+            slot = 'leaveat'
+            if value == '1:00':
+                value = '01:00'
+            elif value == '21:4':
+                value = '21:04'
+            elif value == '4:15':
+                value = '04:15'
+            elif value == '5:45':
+                value = '05:45'
+            elif value == '0700':
+                value = '07:00'
+            elif value == '4:45':
+                value = '04:45'
+            elif value == '8:30':
+                value = '08:30'
+            elif value == '9:30':
+                value = '09:30'
+            value = value.replace('.', ':')
+
+    elif domain == 'train':
+        if slot in ['arriveBy', 'arrive by']:
+            slot = 'arriveby'
+            if value == '1':
+                value = '01:00'
+            elif value in ['does not care', 'doesnt care', "doesn't care"]:
+                value = "do n't care"
+            elif value == '8:30':
+                value = '08:30'
+            elif value == 'not 15:45':
+                value = ''
+            value = value.replace('.', ':')
+        elif slot == 'day':
+            if value == 'doesnt care' or value == "doesn't care":
+                value = "do n't care"
+        elif slot in ['leaveAt', 'leave at']:
+            slot = 'leaveat'
+            if value == '2:30':
+                value = '02:30'
+            elif value == '7:54':
+                value = '07:54'
+            elif value == 'after 5:45 pm':
+                value = '17:45'
+            elif value in [
+                    'early evening', 'friday', 'sunday', 'tuesday', 'afternoon'
+            ]:
+                value = ''
+            elif value == '12':
+                value = '12:00'
+            elif value == '1030':
+                value = '10:30'
+            elif value == '1700':
+                value = '17:00'
+            elif value in [
+                    'does not care', 'doesnt care', 'do nt care',
+                    "doesn't care"
+            ]:
+                value = "do n't care"
+
+            value = value.replace('.', ':')
+    if value in ['dont care', "don't care", 'do nt care', "doesn't care"]:
+        value = "do n't care"
+    if ontology.normlize_slot_names.get(slot):
+        slot = ontology.normlize_slot_names[slot]
+    return slot, value
diff --git a/modelscope/utils/nlp/space/utils.py b/modelscope/utils/nlp/space/utils.py
index ef38684a..81d1b1c5 100644
--- a/modelscope/utils/nlp/space/utils.py
+++ b/modelscope/utils/nlp/space/utils.py
@@ -4,8 +4,11 @@ from collections import OrderedDict
 import json
 import numpy as np
 
+from modelscope.utils.logger import get_logger
 from . import ontology
 
+logger = get_logger()
+
 
 def max_lens(X):
     lens = [len(X)]
@@ -117,8 +120,8 @@ class MultiWOZVocab(object):
     def construct(self):
         freq_dict_sorted = sorted(
             self._freq_dict.keys(), key=lambda x: -self._freq_dict[x])
-        print('Vocabulary size including oov: %d' %
-              (len(freq_dict_sorted) + len(self._idx2word)))
+        logger.info('Vocabulary size including oov: %d' %
+                    (len(freq_dict_sorted) + len(self._idx2word)))
         if len(freq_dict_sorted) + len(self._idx2word) < self.vocab_size:
             logging.warning(
                 'actual label set smaller than that configured: {}/{}'.format(
@@ -148,8 +151,9 @@ class MultiWOZVocab(object):
         for w, idx in self._word2idx.items():
             self._idx2word[idx] = w
         self.vocab_size_oov = len(self._idx2word)
-        print('vocab file loaded from "' + vocab_path + '"')
-        print('Vocabulary size including oov: %d' % (self.vocab_size_oov))
+        logger.info('vocab file loaded from "' + vocab_path + '"')
+        logger.info('Vocabulary size including oov: %d' %
+                    (self.vocab_size_oov))
 
     def save_vocab(self, vocab_path):
         _freq_dict = OrderedDict(
diff --git a/tests/trainers/test_dialog_modeling_trainer.py b/tests/trainers/test_dialog_modeling_trainer.py
new file mode 100644
index 00000000..be03db30
--- /dev/null
+++ b/tests/trainers/test_dialog_modeling_trainer.py
@@ -0,0 +1,68 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import unittest
+
+import torch
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Preprocessors, Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import DownloadMode, ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class TestDialogModelingTrainer(unittest.TestCase):
+
+    model_id = 'damo/nlp_space_pretrained-dialog-model'
+    output_dir = './dialog_fintune_result'
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_with_model_and_args(self):
+        """Fine-tune on MultiWoz2.0, then evaluate the checkpoint written."""
+        data_multiwoz = MsDataset.load(
+            'MultiWoz2.0', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)
+        data_dir = os.path.join(
+            data_multiwoz._hf_ds.config_kwargs['split_config']['train'],
+            'data')
+
+        # download model
+        model_dir = snapshot_download(self.model_id)
+
+        # dialog finetune config
+        def cfg_modify_fn(cfg):
+            config = {
+                'seed': 10,
+                'gpu': 4,
+                'use_data_distributed': False,
+                'valid_metric_name': '-loss',
+                'num_epochs': 60,
+                'save_dir': self.output_dir,
+                'token_loss': True,
+                'batch_size': 32,
+                'log_steps': 10,
+                'valid_steps': 0,
+                'save_checkpoint': True,
+                'save_summary': False,
+                'shuffle': True,
+                'sort_pool_size': 0
+            }
+
+            cfg.Trainer = config
+            cfg.use_gpu = torch.cuda.is_available() and config['gpu'] >= 1
+            return cfg
+
+        # trainer config
+        kwargs = dict(
+            model_dir=model_dir,
+            cfg_name='gen_train_config.json',
+            data_dir=data_dir,
+            cfg_modify_fn=cfg_modify_fn)
+
+        trainer = build_trainer(
+            name=Trainers.dialog_modeling_trainer, default_args=kwargs)
+        trainer.train()
+        checkpoint_path = os.path.join(self.output_dir,
+                                       ModelFile.TORCH_MODEL_BIN_FILE)
+        self.assertTrue(os.path.exists(checkpoint_path))  # survives python -O
+        trainer.evaluate(checkpoint_path=checkpoint_path)

From 937d3ca67bdd717c74be212d2002560a08ab69e5 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Fri, 9 Sep 2022 14:52:35 +0800
Subject: [PATCH 088/175] [to #42322933] bugs:circular dependency fixed/
 word_segmentation output

---
 .../conversational_text_to_sql_pipeline.py    |  4 +--
 .../nlp/word_segmentation_pipeline.py         |  2 +-
 modelscope/preprocessors/nlp.py               |  2 +-
 modelscope/preprocessors/star/__init__.py     |  3 +-
 .../preprocessors/star/fields/__init__.py     | 36 +++++++++++++++----
 modelscope/utils/nlp/__init__.py              | 22 ++++++++++++
 modelscope/utils/nlp/nlp_utils.py             | 19 ----------
 modelscope/utils/nlp/utils.py                 | 20 +++++++++++
 modelscope/utils/test_utils.py                |  2 +-
 9 files changed, 79 insertions(+), 31 deletions(-)
 create mode 100644 modelscope/utils/nlp/utils.py

diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
index 399dad5a..c46e8c81 100644
--- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
+++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
@@ -11,8 +11,8 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import ConversationalTextToSqlPreprocessor
-from modelscope.preprocessors.star.fields.common_utils import SubPreprocessor
-from modelscope.preprocessors.star.fields.process_dataset import process_tables
+from modelscope.preprocessors.star.fields import (SubPreprocessor,
+                                                  process_tables)
 from modelscope.utils.constant import Tasks
 
 __all__ = ['ConversationalTextToSqlPipeline']
diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
index 66a5c524..9899243e 100644
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -94,4 +94,4 @@ class WordSegmentationPipeline(Pipeline):
         if chunk:
             chunks.append(chunk)
         seg_result = ' '.join(chunks)
-        return {OutputKeys.OUTPUT: seg_result}
+        return {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []}
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 84e7ca4d..9137b105 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -16,7 +16,7 @@ from modelscope.utils.config import Config, ConfigFields
 from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
 from modelscope.utils.hub import get_model_type, parse_label_mapping
 from modelscope.utils.logger import get_logger
-from modelscope.utils.nlp.nlp_utils import import_external_nltk_data
+from modelscope.utils.nlp import import_external_nltk_data
 from modelscope.utils.type_assert import type_assert
 from .base import Preprocessor
 from .builder import PREPROCESSORS
diff --git a/modelscope/preprocessors/star/__init__.py b/modelscope/preprocessors/star/__init__.py
index 5a4bcea9..cef8f074 100644
--- a/modelscope/preprocessors/star/__init__.py
+++ b/modelscope/preprocessors/star/__init__.py
@@ -6,7 +6,8 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .conversational_text_to_sql_preprocessor import \
         ConversationalTextToSqlPreprocessor
-    from .fields import MultiWOZBPETextField, IntentBPETextField
+    from .fields import (get_label, SubPreprocessor, preprocess_dataset,
+                         process_dataset)
 
 else:
     _import_structure = {
diff --git a/modelscope/preprocessors/star/fields/__init__.py b/modelscope/preprocessors/star/fields/__init__.py
index 1e95a998..7049c43b 100644
--- a/modelscope/preprocessors/star/fields/__init__.py
+++ b/modelscope/preprocessors/star/fields/__init__.py
@@ -1,6 +1,30 @@
-from modelscope.preprocessors.star.fields.common_utils import SubPreprocessor
-from modelscope.preprocessors.star.fields.parse import get_label
-from modelscope.preprocessors.star.fields.preprocess_dataset import \
-    preprocess_dataset
-from modelscope.preprocessors.star.fields.process_dataset import \
-    process_dataset
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .common_utils import SubPreprocessor
+    from .parse import get_label
+    from .preprocess_dataset import \
+        preprocess_dataset
+    from .process_dataset import \
+        process_dataset, process_tables
+
+else:
+    _import_structure = {
+        'common_utils': ['SubPreprocessor'],
+        'parse': ['get_label'],
+        'preprocess_dataset': ['preprocess_dataset'],
+        'process_dataset': ['process_dataset', 'process_tables'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/utils/nlp/__init__.py b/modelscope/utils/nlp/__init__.py
index e69de29b..62c0b888 100644
--- a/modelscope/utils/nlp/__init__.py
+++ b/modelscope/utils/nlp/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .utils import import_external_nltk_data
+
+else:
+    _import_structure = {
+        'utils': ['import_external_nltk_data'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py
index 64b12007..af539dda 100644
--- a/modelscope/utils/nlp/nlp_utils.py
+++ b/modelscope/utils/nlp/nlp_utils.py
@@ -42,22 +42,3 @@ def tracking_and_print_dialog_states(
         print(json.dumps(result))
 
         history_states.extend([result[OutputKeys.OUTPUT], {}])
-
-
-def import_external_nltk_data(nltk_data_dir, package_name):
-    """import external nltk_data, and extract nltk zip package.
-
-    Args:
-        nltk_data_dir (str): external nltk_data dir path, eg. /home/xx/nltk_data
-        package_name (str): nltk package name, eg. tokenizers/punkt
-    """
-    import nltk
-    nltk.data.path.append(nltk_data_dir)
-
-    filepath = osp.join(nltk_data_dir, package_name + '.zip')
-    zippath = osp.join(nltk_data_dir, package_name)
-    packagepath = osp.dirname(zippath)
-    if not osp.exists(zippath):
-        import zipfile
-        with zipfile.ZipFile(filepath) as zf:
-            zf.extractall(osp.join(packagepath))
diff --git a/modelscope/utils/nlp/utils.py b/modelscope/utils/nlp/utils.py
new file mode 100644
index 00000000..13a21480
--- /dev/null
+++ b/modelscope/utils/nlp/utils.py
@@ -0,0 +1,20 @@
+import os.path as osp
+
+
+def import_external_nltk_data(nltk_data_dir, package_name):
+    """Register an external nltk_data dir and unzip one of its packages.
+
+    Args:
+        nltk_data_dir (str): external nltk_data dir path, eg. /home/xx/nltk_data
+        package_name (str): nltk package name, eg. tokenizers/punkt
+    """
+    import nltk  # imported only when this helper is called
+    nltk.data.path.append(nltk_data_dir)
+
+    filepath = osp.join(nltk_data_dir, package_name + '.zip')
+    zippath = osp.join(nltk_data_dir, package_name)  # extracted location
+    packagepath = osp.dirname(zippath)
+    if not osp.exists(zippath):  # skip extraction when already unpacked
+        import zipfile
+        with zipfile.ZipFile(filepath) as zf:
+            zf.extractall(osp.join(packagepath))  # single-arg join: no-op
diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py
index 8fb621d3..5109db11 100644
--- a/modelscope/utils/test_utils.py
+++ b/modelscope/utils/test_utils.py
@@ -11,7 +11,7 @@ import sys
 import tarfile
 import tempfile
 import unittest
-from typing import OrderedDict
+from collections import OrderedDict
 
 import requests
 import torch

From a4cfbaa0ddb448ed8e3b33917e220ec13f7bef5b Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Fri, 9 Sep 2022 14:56:05 +0800
Subject: [PATCH 089/175] [to #42322933] revert mplug batch inference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

由于之前实现的 batch 化 inference 与 pipelines/base.py 中输入 List[Input] 的情况存在冲突，移除了此处之前实现的 batch 化 inference 代码。mplug 模型在 pipeline 中推理时输入只接受 Image.Image，str，tuple，dict 类型，对于 List[Input] 的情况由 pipelines/base.py 中的代码进行处理
---
 .../models/multi_modal/mplug_for_all_tasks.py    | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py
index a06e5800..d61fea10 100644
--- a/modelscope/models/multi_modal/mplug_for_all_tasks.py
+++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py
@@ -57,18 +57,14 @@ class MPlugForAllTasks(TorchModel):
             if task == Tasks.image_text_retrieval:
                 return {OutputKeys.SCORES: output[0].tolist()}
             topk_ids, _ = output
-            topk_ids = [topk_ids[i][0] for i in range(len(topk_ids))]
-            pred_strings: List[str] = \
-                self.tokenizer.batch_decode(topk_ids, skip_special_tokens=True)
-            output = []
-            for pred_string in pred_strings:
-                for _old, _new in replace_tokens_bert:
-                    pred_string = pred_string.replace(_old, _new)
-                pred_string = pred_string.strip()
-                output.append(pred_string)
+            pred_string: List[str] = \
+                self.tokenizer.decode(topk_ids[0][0])
+            for _old, _new in replace_tokens_bert:
+                pred_string = pred_string.replace(_old, _new)
+            pred_string = pred_string.strip()
             output_key = OutputKeys.CAPTION \
                 if task == Tasks.image_captioning else OutputKeys.TEXT
-            return {output_key: output}
+            return {output_key: pred_string}
 
         # train and evaluate
         import addict

From e0ef60ca9bea74bfe60f8ac6dc3eba35b85390a4 Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingda.chen@alibaba-inc.com>
Date: Fri, 9 Sep 2022 14:56:15 +0800
Subject: [PATCH 090/175] [to #42322933] skip demo test by default

---
 tests/pipelines/test_automatic_speech_recognition.py     | 2 +-
 tests/pipelines/test_cmdssl_video_embedding.py           | 2 +-
 tests/pipelines/test_generative_multi_modal_embedding.py | 2 +-
 tests/pipelines/test_hicossl_video_embedding.py          | 2 +-
 tests/pipelines/test_image_colorization.py               | 2 +-
 tests/pipelines/test_image_denoise.py                    | 2 +-
 tests/pipelines/test_image_instance_segmentation.py      | 2 +-
 tests/pipelines/test_image_matting.py                    | 2 +-
 tests/pipelines/test_image_panoptic_segmentation.py      | 2 +-
 tests/pipelines/test_image_portrait_enhancement.py       | 2 +-
 tests/pipelines/test_image_reid_person.py                | 2 +-
 tests/pipelines/test_image_semantic_segmentation.py      | 2 +-
 tests/pipelines/test_image_style_transfer.py             | 2 +-
 tests/pipelines/test_image_super_resolution.py           | 2 +-
 tests/pipelines/test_key_word_spotting.py                | 2 +-
 tests/pipelines/test_live_category.py                    | 2 +-
 tests/pipelines/test_movie_scene_segmentation.py         | 2 +-
 tests/pipelines/test_mplug_tasks.py                      | 2 +-
 tests/pipelines/test_multi_modal_embedding.py            | 2 +-
 tests/pipelines/test_named_entity_recognition.py         | 2 +-
 tests/pipelines/test_nli.py                              | 2 +-
 tests/pipelines/test_object_detection.py                 | 2 +-
 tests/pipelines/test_ocr_detection.py                    | 2 +-
 tests/pipelines/test_ocr_recognition.py                  | 2 +-
 tests/pipelines/test_ofa_tasks.py                        | 2 +-
 tests/pipelines/test_person_image_cartoon.py             | 2 +-
 tests/pipelines/test_product_retrieval_embedding.py      | 2 +-
 tests/pipelines/test_realtime_object_detection.py        | 2 +-
 tests/pipelines/test_relation_extraction.py              | 2 +-
 tests/pipelines/test_salient_detection.py                | 2 +-
 tests/pipelines/test_sentence_similarity.py              | 2 +-
 tests/pipelines/test_sentiment_classification.py         | 2 +-
 tests/pipelines/test_skin_retouching.py                  | 2 +-
 tests/pipelines/test_speech_signal_process.py            | 2 +-
 tests/pipelines/test_text_classification.py              | 2 +-
 tests/pipelines/test_text_driven_segmentation.py         | 2 +-
 tests/pipelines/test_text_error_correction.py            | 2 +-
 tests/pipelines/test_text_generation.py                  | 2 +-
 tests/pipelines/test_text_to_image_synthesis.py          | 2 +-
 tests/pipelines/test_text_to_speech.py                   | 2 +-
 tests/pipelines/test_tinynas_classification.py           | 2 +-
 tests/pipelines/test_tinynas_detection.py                | 2 +-
 tests/pipelines/test_video_category.py                   | 2 +-
 tests/pipelines/test_video_multi_modal_embedding.py      | 2 +-
 tests/pipelines/test_video_single_object_tracking.py     | 2 +-
 tests/pipelines/test_video_summarization.py              | 2 +-
 tests/pipelines/test_virtual_try_on.py                   | 2 +-
 tests/pipelines/test_word_segmentation.py                | 2 +-
 tests/pipelines/test_zero_shot_classification.py         | 2 +-
 49 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py
index 3c4327be..e475c3cd 100644
--- a/tests/pipelines/test_automatic_speech_recognition.py
+++ b/tests/pipelines/test_automatic_speech_recognition.py
@@ -257,7 +257,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase,
             model_id=self.am_tf_model_id, audio_in=dataset_path)
         self.check_result('test_run_with_wav_dataset_tf', rec_result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_cmdssl_video_embedding.py b/tests/pipelines/test_cmdssl_video_embedding.py
index 2a4cade1..68eae385 100644
--- a/tests/pipelines/test_cmdssl_video_embedding.py
+++ b/tests/pipelines/test_cmdssl_video_embedding.py
@@ -22,7 +22,7 @@ class CMDSSLVideoEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
 
         print(f'video embedding output: {result}.')
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_generative_multi_modal_embedding.py b/tests/pipelines/test_generative_multi_modal_embedding.py
index 464c0d36..9232ebd4 100644
--- a/tests/pipelines/test_generative_multi_modal_embedding.py
+++ b/tests/pipelines/test_generative_multi_modal_embedding.py
@@ -68,7 +68,7 @@ class GEMMMultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
         output = generative_multi_modal_embedding_pipeline(test_input)
         print(output)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_hicossl_video_embedding.py b/tests/pipelines/test_hicossl_video_embedding.py
index dea2e020..8a7de1fa 100644
--- a/tests/pipelines/test_hicossl_video_embedding.py
+++ b/tests/pipelines/test_hicossl_video_embedding.py
@@ -23,7 +23,7 @@ class HICOSSLVideoEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
 
         print(f'video embedding output: {result}.')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_colorization.py b/tests/pipelines/test_image_colorization.py
index a4b132ab..547fce89 100644
--- a/tests/pipelines/test_image_colorization.py
+++ b/tests/pipelines/test_image_colorization.py
@@ -37,7 +37,7 @@ class ImageColorizationTest(unittest.TestCase, DemoCompatibilityCheck):
         image_colorization = pipeline(Tasks.image_colorization)
         self.pipeline_inference(image_colorization, self.test_image)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_denoise.py b/tests/pipelines/test_image_denoise.py
index 4a9df462..bf8cfd0f 100644
--- a/tests/pipelines/test_image_denoise.py
+++ b/tests/pipelines/test_image_denoise.py
@@ -61,7 +61,7 @@ class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck):
         w, h = denoise_img.size
         print('pipeline: the shape of output_img is {}x{}'.format(h, w))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_instance_segmentation.py b/tests/pipelines/test_image_instance_segmentation.py
index 520bc99c..2ba0724a 100644
--- a/tests/pipelines/test_image_instance_segmentation.py
+++ b/tests/pipelines/test_image_instance_segmentation.py
@@ -61,7 +61,7 @@ class ImageInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
         print(f'pipeline1:{pipeline1(input=self.image)[OutputKeys.LABELS]}')
         print(f'pipeline2: {pipeline2(input=self.image)[OutputKeys.LABELS]}')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py
index 2d78f164..a3edb705 100644
--- a/tests/pipelines/test_image_matting.py
+++ b/tests/pipelines/test_image_matting.py
@@ -61,7 +61,7 @@ class ImageMattingTest(unittest.TestCase, DemoCompatibilityCheck):
             f'Output written to dir: {osp.dirname(osp.abspath("result_0.png"))}'
         )
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_panoptic_segmentation.py b/tests/pipelines/test_image_panoptic_segmentation.py
index 8c23ee6c..a1657585 100644
--- a/tests/pipelines/test_image_panoptic_segmentation.py
+++ b/tests/pipelines/test_image_panoptic_segmentation.py
@@ -38,7 +38,7 @@ class ImagePanopticSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
         cv2.imwrite('result.jpg', draw_img)
         print('print test_image_panoptic_segmentation from PIL return success')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_portrait_enhancement.py b/tests/pipelines/test_image_portrait_enhancement.py
index 83a70a0c..1ca97253 100644
--- a/tests/pipelines/test_image_portrait_enhancement.py
+++ b/tests/pipelines/test_image_portrait_enhancement.py
@@ -39,7 +39,7 @@ class ImagePortraitEnhancementTest(unittest.TestCase, DemoCompatibilityCheck):
         face_enhancement = pipeline(Tasks.image_portrait_enhancement)
         self.pipeline_inference(face_enhancement, self.test_image)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_reid_person.py b/tests/pipelines/test_image_reid_person.py
index a4074b58..310cdd66 100644
--- a/tests/pipelines/test_image_reid_person.py
+++ b/tests/pipelines/test_image_reid_person.py
@@ -50,7 +50,7 @@ class ImageReidPersonTest(unittest.TestCase, DemoCompatibilityCheck):
         )
         print(f'The img embedding is: {result[OutputKeys.IMG_EMBEDDING]}')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_semantic_segmentation.py b/tests/pipelines/test_image_semantic_segmentation.py
index 82e606a3..c7876906 100644
--- a/tests/pipelines/test_image_semantic_segmentation.py
+++ b/tests/pipelines/test_image_semantic_segmentation.py
@@ -51,7 +51,7 @@ class ImageSemanticSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
         cv2.imwrite('result.jpg', draw_img)
         print('test_image_semantic_segmentation_vitadapter_from_PIL DONE')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_style_transfer.py b/tests/pipelines/test_image_style_transfer.py
index 4b596cc9..a02d5308 100644
--- a/tests/pipelines/test_image_style_transfer.py
+++ b/tests/pipelines/test_image_style_transfer.py
@@ -50,7 +50,7 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck):
         cv2.imwrite('result_styletransfer3.png', result[OutputKeys.OUTPUT_IMG])
         print('style_transfer.test_run_modelhub_default_model done')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_image_super_resolution.py b/tests/pipelines/test_image_super_resolution.py
index cd3822c3..d5cbebe8 100644
--- a/tests/pipelines/test_image_super_resolution.py
+++ b/tests/pipelines/test_image_super_resolution.py
@@ -37,7 +37,7 @@ class ImageSuperResolutionTest(unittest.TestCase, DemoCompatibilityCheck):
         super_resolution = pipeline(Tasks.image_super_resolution)
         self.pipeline_inference(super_resolution, self.img)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py
index 2f06936f..91f9f566 100644
--- a/tests/pipelines/test_key_word_spotting.py
+++ b/tests/pipelines/test_key_word_spotting.py
@@ -296,7 +296,7 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck):
             model_id=self.model_id, audio_in=audio_list)
         self.check_result('test_run_with_roc', kws_result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_live_category.py b/tests/pipelines/test_live_category.py
index 835bc602..391ed283 100644
--- a/tests/pipelines/test_live_category.py
+++ b/tests/pipelines/test_live_category.py
@@ -21,7 +21,7 @@ class LiveCategoryTest(unittest.TestCase, DemoCompatibilityCheck):
 
         print(f'live category output: {result}.')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_movie_scene_segmentation.py b/tests/pipelines/test_movie_scene_segmentation.py
index e2fdc224..affd5140 100644
--- a/tests/pipelines/test_movie_scene_segmentation.py
+++ b/tests/pipelines/test_movie_scene_segmentation.py
@@ -35,7 +35,7 @@ class MovieSceneSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
         else:
             raise ValueError('process error')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py
index 55930b13..273d3105 100644
--- a/tests/pipelines/test_mplug_tasks.py
+++ b/tests/pipelines/test_mplug_tasks.py
@@ -80,7 +80,7 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         result = pipeline_retrieval(input)
         print(result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py
index 3d296370..23954c27 100644
--- a/tests/pipelines/test_multi_modal_embedding.py
+++ b/tests/pipelines/test_multi_modal_embedding.py
@@ -59,7 +59,7 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
         print('l2-norm: {}'.format(torch.norm(text_embedding,
                                               dim=-1).item()))  # should be 1.0
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py
index 2c8d7b70..9fae2d09 100644
--- a/tests/pipelines/test_named_entity_recognition.py
+++ b/tests/pipelines/test_named_entity_recognition.py
@@ -94,7 +94,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(task=Tasks.named_entity_recognition)
         print(pipeline_ins(input=self.sentence))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py
index 80c69a01..a53ac3b3 100644
--- a/tests/pipelines/test_nli.py
+++ b/tests/pipelines/test_nli.py
@@ -57,7 +57,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(task=Tasks.nli)
         print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py
index a754a517..2a74eb41 100644
--- a/tests/pipelines/test_object_detection.py
+++ b/tests/pipelines/test_object_detection.py
@@ -55,7 +55,7 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
         else:
             raise ValueError('process error')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py
index eeaa9d7a..e0591496 100644
--- a/tests/pipelines/test_ocr_detection.py
+++ b/tests/pipelines/test_ocr_detection.py
@@ -30,7 +30,7 @@ class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
         ocr_detection = pipeline(Tasks.ocr_detection)
         self.pipeline_inference(ocr_detection, self.test_image)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_ocr_recognition.py b/tests/pipelines/test_ocr_recognition.py
index c4eb9e7a..8d48dd7a 100644
--- a/tests/pipelines/test_ocr_recognition.py
+++ b/tests/pipelines/test_ocr_recognition.py
@@ -37,7 +37,7 @@ class OCRRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
         ocr_recognition = pipeline(Tasks.ocr_recognition)
         self.pipeline_inference(ocr_recognition, self.test_image)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py
index 455b196b..9a72d1ff 100644
--- a/tests/pipelines/test_ofa_tasks.py
+++ b/tests/pipelines/test_ofa_tasks.py
@@ -252,7 +252,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         result[OutputKeys.OUTPUT_IMG].save('result.png')
         print(f'Output written to {osp.abspath("result.png")}')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py
index ef30d702..5c81cd28 100644
--- a/tests/pipelines/test_person_image_cartoon.py
+++ b/tests/pipelines/test_person_image_cartoon.py
@@ -36,7 +36,7 @@ class ImageCartoonTest(unittest.TestCase, DemoCompatibilityCheck):
         img_cartoon = pipeline(Tasks.image_portrait_stylization)
         self.pipeline_inference(img_cartoon, self.test_image)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_product_retrieval_embedding.py b/tests/pipelines/test_product_retrieval_embedding.py
index f2b0a33d..235847be 100644
--- a/tests/pipelines/test_product_retrieval_embedding.py
+++ b/tests/pipelines/test_product_retrieval_embedding.py
@@ -39,7 +39,7 @@ class ProductRetrievalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
         result = product_embed(self.img_input)[OutputKeys.IMG_EMBEDDING]
         print('abs sum value is: {}'.format(np.sum(np.abs(result))))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_realtime_object_detection.py b/tests/pipelines/test_realtime_object_detection.py
index 25e8ffd4..e04f6b5c 100644
--- a/tests/pipelines/test_realtime_object_detection.py
+++ b/tests/pipelines/test_realtime_object_detection.py
@@ -47,7 +47,7 @@ class RealtimeObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
         else:
             raise ValueError('process error')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py
index d9e260f2..57d98f66 100644
--- a/tests/pipelines/test_relation_extraction.py
+++ b/tests/pipelines/test_relation_extraction.py
@@ -55,7 +55,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(task=Tasks.information_extraction)
         print(pipeline_ins(input=self.sentence))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_salient_detection.py b/tests/pipelines/test_salient_detection.py
index 52e84be7..e87e9388 100644
--- a/tests/pipelines/test_salient_detection.py
+++ b/tests/pipelines/test_salient_detection.py
@@ -24,7 +24,7 @@ class SalientDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
         # result[OutputKeys.MASKS] is salient map result,other keys are not used
         cv2.imwrite(input_location + '_salient.jpg', result[OutputKeys.MASKS])
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py
index d9da1e65..4079455d 100644
--- a/tests/pipelines/test_sentence_similarity.py
+++ b/tests/pipelines/test_sentence_similarity.py
@@ -63,7 +63,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(task=Tasks.sentence_similarity)
         print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py
index 939b7360..3db9971a 100644
--- a/tests/pipelines/test_sentiment_classification.py
+++ b/tests/pipelines/test_sentiment_classification.py
@@ -66,7 +66,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase,
         self.assertTrue(
             isinstance(pipeline_ins.model, SequenceClassificationModel))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_skin_retouching.py b/tests/pipelines/test_skin_retouching.py
index 9e73334c..db8d89ed 100644
--- a/tests/pipelines/test_skin_retouching.py
+++ b/tests/pipelines/test_skin_retouching.py
@@ -41,7 +41,7 @@ class SkinRetouchingTest(unittest.TestCase, DemoCompatibilityCheck):
         skin_retouching = pipeline(Tasks.skin_retouching)
         self.pipeline_inference(skin_retouching, self.test_image)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py
index e1987c28..517facae 100644
--- a/tests/pipelines/test_speech_signal_process.py
+++ b/tests/pipelines/test_speech_signal_process.py
@@ -113,7 +113,7 @@ class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck):
             ans(data, output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index 3a2870ea..71b9f3e2 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -89,7 +89,7 @@ class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
         result = text_classification(dataset)
         self.printDataset(result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_text_driven_segmentation.py b/tests/pipelines/test_text_driven_segmentation.py
index a693edac..a67729ff 100644
--- a/tests/pipelines/test_text_driven_segmentation.py
+++ b/tests/pipelines/test_text_driven_segmentation.py
@@ -23,7 +23,7 @@ class TextDrivenSegmentationTest(unittest.TestCase):
         # result[OutputKeys.MASKS] is segment map result,other keys are not used
         cv2.imwrite(input_location + '_lseg.jpg', result[OutputKeys.MASKS])
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.test_demo()
 
diff --git a/tests/pipelines/test_text_error_correction.py b/tests/pipelines/test_text_error_correction.py
index 3400fbb7..a714d3d0 100644
--- a/tests/pipelines/test_text_error_correction.py
+++ b/tests/pipelines/test_text_error_correction.py
@@ -55,7 +55,7 @@ class TextErrorCorrectionTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(task=Tasks.text_error_correction)
         print(pipeline_ins(self.input))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
index 2a4d470d..66f9c9da 100644
--- a/tests/pipelines/test_text_generation.py
+++ b/tests/pipelines/test_text_generation.py
@@ -129,7 +129,7 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(task=Tasks.text_generation)
         print(pipeline_ins(self.palm_input_zh))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_text_to_image_synthesis.py b/tests/pipelines/test_text_to_image_synthesis.py
index 5a5ed357..0da6768a 100644
--- a/tests/pipelines/test_text_to_image_synthesis.py
+++ b/tests/pipelines/test_text_to_image_synthesis.py
@@ -51,7 +51,7 @@ class TextToImageSynthesisTest(unittest.TestCase, DemoCompatibilityCheck):
             self.test_text)[OutputKeys.OUTPUT_IMG]
         print(np.sum(np.abs(img)))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py
index 0a075352..374f0fd2 100644
--- a/tests/pipelines/test_text_to_speech.py
+++ b/tests/pipelines/test_text_to_speech.py
@@ -38,7 +38,7 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase,
         pcm = output[OutputKeys.OUTPUT_PCM]
         write('output.wav', 16000, pcm)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_tinynas_classification.py b/tests/pipelines/test_tinynas_classification.py
index da5ca933..204b8bdb 100644
--- a/tests/pipelines/test_tinynas_classification.py
+++ b/tests/pipelines/test_tinynas_classification.py
@@ -19,7 +19,7 @@ class TinyNASClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
         result = tinynas_classification('data/test/images/image_wolf.jpeg')
         print(result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py
index e9eaeb59..b13644be 100644
--- a/tests/pipelines/test_tinynas_detection.py
+++ b/tests/pipelines/test_tinynas_detection.py
@@ -15,7 +15,7 @@ class TinynasObjectDetectionTest(unittest.TestCase):
             'data/test/images/image_detection.jpg')
         print(result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.test_demo()
 
diff --git a/tests/pipelines/test_video_category.py b/tests/pipelines/test_video_category.py
index 98890bef..660196b8 100644
--- a/tests/pipelines/test_video_category.py
+++ b/tests/pipelines/test_video_category.py
@@ -21,7 +21,7 @@ class VideoCategoryTest(unittest.TestCase, DemoCompatibilityCheck):
 
         print(f'video category output: {result}.')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_video_multi_modal_embedding.py b/tests/pipelines/test_video_multi_modal_embedding.py
index 9e26c967..f4aa4d24 100644
--- a/tests/pipelines/test_video_multi_modal_embedding.py
+++ b/tests/pipelines/test_video_multi_modal_embedding.py
@@ -41,7 +41,7 @@ class VideoMultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
         logger.info('video feature: {}'.format(
             output['video_embedding'][0][0][0]))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_video_single_object_tracking.py b/tests/pipelines/test_video_single_object_tracking.py
index 51d39c20..7f3a9226 100644
--- a/tests/pipelines/test_video_single_object_tracking.py
+++ b/tests/pipelines/test_video_single_object_tracking.py
@@ -35,7 +35,7 @@ class SingleObjectTracking(unittest.TestCase, DemoCompatibilityCheck):
         result = video_single_object_tracking((video_path, init_bbox))
         print('result is : ', result[OutputKeys.BOXES])
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_video_summarization.py b/tests/pipelines/test_video_summarization.py
index 67c0cbd1..6dcc31e9 100644
--- a/tests/pipelines/test_video_summarization.py
+++ b/tests/pipelines/test_video_summarization.py
@@ -33,7 +33,7 @@ class VideoSummarizationTest(unittest.TestCase, DemoCompatibilityCheck):
 
         print(f'video summarization output:\n {result}.')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_virtual_try_on.py b/tests/pipelines/test_virtual_try_on.py
index 07132c8a..e1dd78a2 100644
--- a/tests/pipelines/test_virtual_try_on.py
+++ b/tests/pipelines/test_virtual_try_on.py
@@ -34,7 +34,7 @@ class VirtualTryonTest(unittest.TestCase, DemoCompatibilityCheck):
         img = pipeline_virtual_tryon(self.input_imgs)[OutputKeys.OUTPUT_IMG]
         cv2.imwrite('demo.jpg', img[:, :, ::-1])
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
index 835f59e7..cd01b98f 100644
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -59,7 +59,7 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(task=Tasks.word_segmentation)
         print(pipeline_ins(input=self.sentence))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
index cdf6f31e..da1854c9 100644
--- a/tests/pipelines/test_zero_shot_classification.py
+++ b/tests/pipelines/test_zero_shot_classification.py
@@ -70,7 +70,7 @@ class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(task=Tasks.zero_shot_classification)
         print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
 

From f4a47d006acaab929238093ceb18ecb6772d61ec Mon Sep 17 00:00:00 2001
From: "hejunjie.hjj" <hejunjie.hjj@alibaba-inc.com>
Date: Sat, 10 Sep 2022 12:57:32 +0800
Subject: [PATCH 091/175] [to #42322933] format boxes output [x, y, w, h] to
 [x1, y1, x2, y2]         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10090172

    * fix bug when pipeline input is Image.Image or numpy.ndarray
---
 .../postprocess_utils.py                      | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py
index 43e52292..531e2efd 100644
--- a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py
+++ b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py
@@ -105,12 +105,12 @@ def get_img_ins_seg_result(img_seg_result=None,
     }
     for seg_result in img_seg_result:
 
-        box = {
-            'x': np.int(seg_result[0]),
-            'y': np.int(seg_result[1]),
-            'w': np.int(seg_result[2] - seg_result[0]),
-            'h': np.int(seg_result[3] - seg_result[1])
-        }
+        box = [
+            np.int(seg_result[0]),
+            np.int(seg_result[1]),
+            np.int(seg_result[2]),
+            np.int(seg_result[3])
+        ]
         score = np.float(seg_result[4])
         category = seg_result[5]
 
@@ -161,12 +161,10 @@ def show_result(
             np.random.random() * 255.0
         ])
 
-        x1 = int(box['x'])
-        y1 = int(box['y'])
-        w = int(box['w'])
-        h = int(box['h'])
-        x2 = x1 + w
-        y2 = y1 + h
+        x1 = int(box[0])
+        y1 = int(box[1])
+        x2 = int(box[2])
+        y2 = int(box[3])
 
         if show_box:
             cv2.rectangle(

From 54e1a6d88b73717b44338f1616f7f15f144d2c87 Mon Sep 17 00:00:00 2001
From: "dingkun.ldk" <dingkun.ldk@alibaba-inc.com>
Date: Sat, 10 Sep 2022 15:59:56 +0800
Subject: [PATCH 092/175] =?UTF-8?q?[to=20#42322933]830NLP=20=E7=AF=87?=
 =?UTF-8?q?=E7=AB=A0=E6=8E=92=E5=BA=8F/=E6=96=87=E6=9C=AC=E8=A1=A8?=
 =?UTF-8?q?=E7=A4=BA=E6=A8=A1=E5=9E=8B=E4=BB=A3=E7=A0=81check=20=20=20=20?=
 =?UTF-8?q?=20=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/Ma?=
 =?UTF-8?q?aS-lib/codereview/9856179?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 modelscope/metainfo.py                        |   6 +
 modelscope/models/nlp/__init__.py             |   4 +
 modelscope/models/nlp/passage_ranking.py      |  78 +++++++
 modelscope/models/nlp/sentence_embedding.py   |  74 +++++++
 .../msdatasets/task_datasets/__init__.py      |   2 +
 .../task_datasets/passage_ranking_dataset.py  | 151 ++++++++++++++
 modelscope/outputs.py                         |   9 +-
 modelscope/pipelines/builder.py               |   5 +
 modelscope/pipelines/nlp/__init__.py          |   5 +-
 .../pipelines/nlp/passage_ranking_pipeline.py |  58 ++++++
 .../nlp/sentence_embedding_pipeline.py        |  60 ++++++
 modelscope/preprocessors/__init__.py          |   4 +-
 modelscope/preprocessors/nlp.py               | 104 ++++++++-
 modelscope/trainers/__init__.py               |   4 +-
 modelscope/trainers/nlp/__init__.py           |   2 +
 .../trainers/nlp/passage_ranking_trainer.py   | 197 ++++++++++++++++++
 modelscope/trainers/trainer.py                |  12 +-
 modelscope/utils/constant.py                  |   2 +
 tests/pipelines/test_passage_ranking.py       |  61 ++++++
 tests/pipelines/test_sentence_embedding.py    |  82 ++++++++
 .../trainers/test_finetune_passage_ranking.py | 133 ++++++++++++
 21 files changed, 1035 insertions(+), 18 deletions(-)
 create mode 100644 modelscope/models/nlp/passage_ranking.py
 create mode 100644 modelscope/models/nlp/sentence_embedding.py
 create mode 100644 modelscope/msdatasets/task_datasets/passage_ranking_dataset.py
 create mode 100644 modelscope/pipelines/nlp/passage_ranking_pipeline.py
 create mode 100644 modelscope/pipelines/nlp/sentence_embedding_pipeline.py
 create mode 100644 modelscope/trainers/nlp/passage_ranking_trainer.py
 create mode 100644 tests/pipelines/test_passage_ranking.py
 create mode 100644 tests/pipelines/test_sentence_embedding.py
 create mode 100644 tests/trainers/test_finetune_passage_ranking.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 63b4f1c2..e5c3873b 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -193,6 +193,8 @@ class Pipelines(object):
     plug_generation = 'plug-generation'
     faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    sentence_embedding = 'sentence-embedding'
+    passage_ranking = 'passage-ranking'
     relation_extraction = 'relation-extraction'
     document_segmentation = 'document-segmentation'
 
@@ -245,6 +247,7 @@ class Trainers(object):
     dialog_intent_trainer = 'dialog-intent-trainer'
     nlp_base_trainer = 'nlp-base-trainer'
     nlp_veco_trainer = 'nlp-veco-trainer'
+    nlp_passage_ranking_trainer = 'nlp-passage-ranking-trainer'
 
     # audio trainers
     speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
@@ -272,6 +275,7 @@ class Preprocessors(object):
 
     # nlp preprocessor
     sen_sim_tokenizer = 'sen-sim-tokenizer'
+    cross_encoder_tokenizer = 'cross-encoder-tokenizer'
     bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
     text_gen_tokenizer = 'text-gen-tokenizer'
     token_cls_tokenizer = 'token-cls-tokenizer'
@@ -284,6 +288,8 @@ class Preprocessors(object):
     sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer'
     zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'
     text_error_correction = 'text-error-correction'
+    sentence_embedding = 'sentence-embedding'
+    passage_ranking = 'passage-ranking'
     sequence_labeling_tokenizer = 'sequence-labeling-tokenizer'
     word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor'
     fill_mask = 'fill-mask'
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index a3a12c22..d411f1fb 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -29,6 +29,8 @@ if TYPE_CHECKING:
                               SingleBackboneTaskModelBase,
                               TokenClassificationModel)
     from .token_classification import SbertForTokenClassification
+    from .sentence_embedding import SentenceEmbedding
+    from .passage_ranking import PassageRanking
 
 else:
     _import_structure = {
@@ -62,6 +64,8 @@ else:
             'SingleBackboneTaskModelBase', 'TokenClassificationModel'
         ],
         'token_classification': ['SbertForTokenClassification'],
+        'sentence_embedding': ['SentenceEmbedding'],
+        'passage_ranking': ['PassageRanking'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/passage_ranking.py b/modelscope/models/nlp/passage_ranking.py
new file mode 100644
index 00000000..68bca231
--- /dev/null
+++ b/modelscope/models/nlp/passage_ranking.py
@@ -0,0 +1,78 @@
+from typing import Any, Dict
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp import SbertForSequenceClassification
+from modelscope.models.nlp.structbert import SbertPreTrainedModel
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+__all__ = ['PassageRanking']
+
+
+@MODELS.register_module(Tasks.passage_ranking, module_name=Models.bert)
+class PassageRanking(SbertForSequenceClassification, SbertPreTrainedModel):
+    base_model_prefix: str = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, model_dir, *args, **kwargs):
+        if hasattr(config, 'base_model_prefix'):
+            PassageRanking.base_model_prefix = config.base_model_prefix
+        super().__init__(config, model_dir)
+        self.train_batch_size = kwargs.get('train_batch_size', 4)
+        self.register_buffer(
+            'target_label',
+            torch.zeros(self.train_batch_size, dtype=torch.long))
+
+    def build_base_model(self):
+        from .structbert import SbertModel
+        return SbertModel(self.config, add_pooling_layer=True)
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        outputs = self.base_model.forward(**input)
+
+        # backbone model should return pooled_output as its second output
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        if self.base_model.training:
+            scores = logits.view(self.train_batch_size, -1)
+            loss_fct = torch.nn.CrossEntropyLoss()
+            loss = loss_fct(scores, self.target_label)
+            return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss}
+        return {OutputKeys.LOGITS: logits}
+
+    def sigmoid(self, logits):
+        """Element-wise logistic function of a numpy array, safe against exp overflow."""
+        return np.exp(-np.logaddexp(0, -logits))
+
+    def postprocess(self, inputs: Dict[str, np.ndarray],
+                    **kwargs) -> Dict[str, np.ndarray]:
+        """Squeeze the last axis of `logits` and return its sigmoid as a list under SCORES."""
+        logits = inputs['logits'].squeeze(-1).detach().cpu().numpy()
+        return {OutputKeys.SCORES: self.sigmoid(logits).tolist()}
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        """Instantiate the model.
+
+        @param kwargs: Input args.
+                    model_dir: The model dir; passed both as the checkpoint path and as `model_dir`.
+                    num_labels: An optional arg to tell the model how many classes to initialize.
+                                    Defaults to 1 when the key is absent; no label mapping is parsed here.
+                                    If None is passed explicitly, `num_labels` is not forwarded at all.
+        @return: The loaded model from the parent-class from_pretrained (presumably transformers.PreTrainedModel's)
+        """
+
+        num_labels = kwargs.get('num_labels', 1)
+        model_args = {} if num_labels is None else {'num_labels': num_labels}
+
+        return super(SbertPreTrainedModel, PassageRanking).from_pretrained(
+            pretrained_model_name_or_path=kwargs.get('model_dir'),
+            model_dir=kwargs.get('model_dir'),
+            **model_args)
diff --git a/modelscope/models/nlp/sentence_embedding.py b/modelscope/models/nlp/sentence_embedding.py
new file mode 100644
index 00000000..955c0e53
--- /dev/null
+++ b/modelscope/models/nlp/sentence_embedding.py
@@ -0,0 +1,74 @@
+import os
+from typing import Any, Dict
+
+import json
+import numpy as np
+
+from modelscope.metainfo import Models
+from modelscope.models import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.structbert import SbertPreTrainedModel
+from modelscope.utils.constant import Tasks
+
+__all__ = ['SentenceEmbedding']
+
+
+@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert)
+class SentenceEmbedding(TorchModel, SbertPreTrainedModel):
+    base_model_prefix: str = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, model_dir):
+        super().__init__(model_dir)
+        self.config = config
+        setattr(self, self.base_model_prefix, self.build_base_model())
+
+    def build_base_model(self):
+        from .structbert import SbertModel
+        return SbertModel(self.config, add_pooling_layer=False)
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        """Run the backbone encoder on the preprocessed batch.
+
+        Args:
+            input (Dict[str, Any]): the preprocessed data, unpacked as
+                keyword arguments into the base model.
+
+        Returns:
+            The output of the base model, returned unchanged. No
+            classification head is applied here: `postprocess` reads
+            `last_hidden_state` from this output and takes position 0
+            of each sequence as the sentence embedding.
+            NOTE(review): the `Dict[str, np.ndarray]` annotation looks
+            inherited from a classifier -- confirm the actual type.
+        """
+        return self.base_model(**input)
+
+    def postprocess(self, inputs: Dict[str, np.ndarray],
+                    **kwargs) -> Dict[str, np.ndarray]:
+        embs = inputs['last_hidden_state'][:, 0].cpu().numpy()
+        num_sent = embs.shape[0]
+        if num_sent >= 2:
+            scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ],
+                                                      (1, 0))).tolist()[0]
+        else:
+            scores = []
+        result = {'text_embedding': embs, 'scores': scores}
+
+        return result
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        """Instantiate the model.
+
+        @param kwargs: Input args.
+                    model_dir: The model dir used to load the checkpoint and the label information.
+        @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
+        """
+        model_args = {}
+
+        return super(SbertPreTrainedModel, SentenceEmbedding).from_pretrained(
+            pretrained_model_name_or_path=kwargs.get('model_dir'),
+            model_dir=kwargs.get('model_dir'),
+            **model_args)
diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py
index f97ff8b2..e2bf5bc1 100644
--- a/modelscope/msdatasets/task_datasets/__init__.py
+++ b/modelscope/msdatasets/task_datasets/__init__.py
@@ -11,12 +11,14 @@ if TYPE_CHECKING:
     from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset
     from .movie_scene_segmentation import MovieSceneSegmentationDataset
     from .video_summarization_dataset import VideoSummarizationDataset
+    from .passage_ranking_dataset import PassageRankingDataset
 
 else:
     _import_structure = {
         'base': ['TaskDataset'],
         'builder': ['TASK_DATASETS', 'build_task_dataset'],
         'torch_base_dataset': ['TorchTaskDataset'],
+        'passage_ranking_dataset': ['PassageRankingDataset'],
         'veco_dataset': ['VecoDataset'],
         'image_instance_segmentation_coco_dataset':
         ['ImageInstanceSegmentationCocoDataset'],
diff --git a/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py b/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py
new file mode 100644
index 00000000..517e0d36
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py
@@ -0,0 +1,151 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import random
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple, Union
+
+import torch
+from datasets import Dataset, IterableDataset, concatenate_datasets
+from torch.utils.data import ConcatDataset
+from transformers import DataCollatorWithPadding
+
+from modelscope.metainfo import Models
+from modelscope.utils.constant import ModeKeys, Tasks
+from .base import TaskDataset
+from .builder import TASK_DATASETS
+from .torch_base_dataset import TorchTaskDataset
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.passage_ranking, module_name=Models.bert)
+class PassageRankingDataset(TorchTaskDataset):
+
+    def __init__(self,
+                 datasets: Union[Any, List[Any]],
+                 mode,
+                 preprocessor=None,
+                 *args,
+                 **kwargs):
+        self.seed = kwargs.get('seed', 42)
+        self.permutation = None
+        self.datasets = None
+        self.dataset_config = kwargs
+        self.query_sequence = self.dataset_config.get('query_sequence',
+                                                      'query')
+        self.pos_sequence = self.dataset_config.get('pos_sequence',
+                                                    'positive_passages')
+        self.neg_sequence = self.dataset_config.get('neg_sequence',
+                                                    'negative_passages')
+        self.passage_text_fileds = self.dataset_config.get(
+            'passage_text_fileds', ['title', 'text'])
+        self.qid_field = self.dataset_config.get('qid_field', 'query_id')
+        if mode == ModeKeys.TRAIN:
+            train_config = kwargs.get('train', {})
+            self.neg_samples = train_config.get('neg_samples', 4)
+
+        super().__init__(datasets, mode, preprocessor, **kwargs)
+
+    def __getitem__(self, index) -> Any:
+        if self.mode == ModeKeys.TRAIN:
+            return self.__get_train_item__(index)
+        else:
+            return self.__get_test_item__(index)
+
+    def __get_test_item__(self, index):
+        group = self._inner_dataset[index]
+        labels = []
+
+        qry = group[self.query_sequence]
+
+        pos_sequences = group[self.pos_sequence]
+        pos_sequences = [
+            ' '.join([ele[key] for key in self.passage_text_fileds])
+            for ele in pos_sequences
+        ]
+        labels.extend([1] * len(pos_sequences))
+
+        neg_sequences = group[self.neg_sequence]
+        neg_sequences = [
+            ' '.join([ele[key] for key in self.passage_text_fileds])
+            for ele in neg_sequences
+        ]
+
+        labels.extend([0] * len(neg_sequences))
+        qid = group[self.qid_field]
+
+        examples = pos_sequences + neg_sequences
+        sample = {
+            'qid': torch.LongTensor([int(qid)] * len(labels)),
+            self.preprocessor.first_sequence: qry,
+            self.preprocessor.second_sequence: examples,
+            'labels': torch.LongTensor(labels)
+        }
+        return self.prepare_sample(sample)
+
+    def __get_train_item__(self, index):
+        group = self._inner_dataset[index]
+
+        qry = group[self.query_sequence]
+
+        pos_sequences = group[self.pos_sequence]
+        pos_sequences = [
+            ' '.join([ele[key] for key in self.passage_text_fileds])
+            for ele in pos_sequences
+        ]
+
+        neg_sequences = group[self.neg_sequence]
+        neg_sequences = [
+            ' '.join([ele[key] for key in self.passage_text_fileds])
+            for ele in neg_sequences
+        ]
+
+        pos_psg = random.choice(pos_sequences)
+
+        if len(neg_sequences) < self.neg_samples:
+            negs = random.choices(neg_sequences, k=self.neg_samples)
+        else:
+            negs = random.sample(neg_sequences, k=self.neg_samples)
+        examples = [pos_psg] + negs
+        sample = {
+            self.preprocessor.first_sequence: qry,
+            self.preprocessor.second_sequence: examples,
+        }
+        return self.prepare_sample(sample)
+
+    def __len__(self):
+        return len(self._inner_dataset)
+
+    def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any:
+        """Prepare a dataset.
+
+        User can process the input datasets in a whole dataset perspective.
+        This method gives a default implementation of datasets merging, user can override this
+        method to write custom logics.
+
+        Args:
+            datasets: The original dataset(s)
+
+        Returns: A single dataset, which may be created after merging.
+
+        """
+        if isinstance(datasets, List):
+            if len(datasets) == 1:
+                return datasets[0]
+            elif len(datasets) > 1:
+                return ConcatDataset(datasets)
+        else:
+            return datasets
+
+    def prepare_sample(self, data):
+        """Preprocess the data fetched from the inner_dataset.
+
+        If the preprocessor is None, the original data will be returned, else the preprocessor will be called.
+        User can override this method to implement custom logics.
+
+        Args:
+            data: The data fetched from the dataset.
+
+        Returns: The processed data.
+
+        """
+        return self.preprocessor(
+            data) if self.preprocessor is not None else data
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 37ab3481..8ddeb314 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -387,19 +387,14 @@ TASK_OUTPUTS = {
     #    "output": "我想吃苹果"
     # }
     Tasks.text_error_correction: [OutputKeys.OUTPUT],
-
+    Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],
+    Tasks.passage_ranking: [OutputKeys.SCORES],
     # text generation result for single sample
     # {
     #   "text": "this is the text generated by a model."
     # }
     Tasks.text_generation: [OutputKeys.TEXT],
 
-    # text feature extraction for single sample
-    # {
-    #   "text_embedding": np.array with shape [1, D]
-    # }
-    Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING],
-
     # fill mask result for single sample
     # {
     #   "text": "this is the text which masks filled by model."
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index a1f093a3..50313cf7 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -17,6 +17,11 @@ PIPELINES = Registry('pipelines')
 
 DEFAULT_MODEL_FOR_PIPELINE = {
     # TaskName: (pipeline_module_name, model_repo)
+    Tasks.sentence_embedding:
+    (Pipelines.sentence_embedding,
+     'damo/nlp_corom_sentence-embedding_english-base'),
+    Tasks.passage_ranking: (Pipelines.passage_ranking,
+                            'damo/nlp_corom_passage-ranking_english-base'),
     Tasks.word_segmentation:
     (Pipelines.word_segmentation,
      'damo/nlp_structbert_word-segmentation_chinese-base'),
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 42dfc972..6f898c0f 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -25,7 +25,8 @@ if TYPE_CHECKING:
     from .translation_pipeline import TranslationPipeline
     from .word_segmentation_pipeline import WordSegmentationPipeline
     from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
-
+    from .passage_ranking_pipeline import PassageRankingPipeline
+    from .sentence_embedding_pipeline import SentenceEmbeddingPipeline
 else:
     _import_structure = {
         'conversational_text_to_sql_pipeline':
@@ -55,6 +56,8 @@ else:
         'word_segmentation_pipeline': ['WordSegmentationPipeline'],
         'zero_shot_classification_pipeline':
         ['ZeroShotClassificationPipeline'],
+        'passage_ranking_pipeline': ['PassageRankingPipeline'],
+        'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline']
     }
 
     import sys
diff --git a/modelscope/pipelines/nlp/passage_ranking_pipeline.py b/modelscope/pipelines/nlp/passage_ranking_pipeline.py
new file mode 100644
index 00000000..c03e7b93
--- /dev/null
+++ b/modelscope/pipelines/nlp/passage_ranking_pipeline.py
@@ -0,0 +1,58 @@
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import PassageRankingPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+
+__all__ = ['PassageRankingPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.passage_ranking, module_name=Pipelines.passage_ranking)
+class PassageRankingPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+        """Use `model` and `preprocessor` to create an nlp passage ranking pipeline for prediction.
+
+        Args:
+            model (str or Model): Supply either a local model dir which supports the passage ranking task,
+            or a model id from the model hub, or a model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits
+            the model if supplied. A PassageRankingPreprocessor is built from the model dir otherwise.
+            sequence_length: Max sequence length, read from kwargs for the default preprocessor (default 128).
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+
+        if preprocessor is None:
+            preprocessor = PassageRankingPreprocessor(
+                model.model_dir if isinstance(model, Model) else model,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return {**self.model(inputs, **forward_params)}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """process the prediction results
+        Args:
+            inputs (Dict[str, Any]): the model output; must contain the `OutputKeys.SCORES` entry
+
+        Returns:
+            Dict[str, Any]: a dict holding only the `OutputKeys.SCORES` entry of `inputs`
+        """
+        pred_list = inputs[OutputKeys.SCORES]
+
+        return {OutputKeys.SCORES: pred_list}
diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
new file mode 100644
index 00000000..3ef6d06b
--- /dev/null
+++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
@@ -0,0 +1,60 @@
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import (Preprocessor,
+                                      SentenceEmbeddingPreprocessor)
+from modelscope.utils.constant import Tasks
+
+__all__ = ['SentenceEmbeddingPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.sentence_embedding, module_name=Pipelines.sentence_embedding)
+class SentenceEmbeddingPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 first_sequence='first_sequence',
+                 **kwargs):
+        """Use `model` and `preprocessor` to create an nlp text dual encoder that generates text representations.
+        Args:
+            model (str or Model): Supply either a local model dir which supports the sentence embedding task,
+            or a model id from the model hub, or a model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance that fits the model; when omitted, a
+            SentenceEmbeddingPreprocessor is built with `first_sequence` (forwarded as given) and `sequence_length`.
+            sequence_length: Max sequence length, read from kwargs for the default preprocessor (default 128).
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = SentenceEmbeddingPreprocessor(
+                model.model_dir if isinstance(model, Model) else model,
+                first_sequence=first_sequence,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return {**self.model(inputs, **forward_params)}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """process the prediction results
+
+        Args:
+            inputs (Dict[str, Any]): must contain the `OutputKeys.TEXT_EMBEDDING` and `OutputKeys.SCORES` entries
+
+        Returns:
+            Dict[str, Any]: a dict holding exactly those two entries, copied from `inputs`
+        """
+        embs = inputs[OutputKeys.TEXT_EMBEDDING]
+        scores = inputs[OutputKeys.SCORES]
+        return {OutputKeys.TEXT_EMBEDDING: embs, OutputKeys.SCORES: scores}
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 6012b5ba..212339ae 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -23,7 +23,8 @@ if TYPE_CHECKING:
         ZeroShotClassificationPreprocessor, NERPreprocessor,
         TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
         SequenceLabelingPreprocessor, RelationExtractionPreprocessor,
-        DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor)
+        DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor,
+        PassageRankingPreprocessor)
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
                         DialogStateTrackingPreprocessor)
@@ -50,6 +51,7 @@ else:
             'SingleSentenceClassificationPreprocessor',
             'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
             'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
+            'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
             'TextErrorCorrectionPreprocessor',
             'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
             'RelationExtractionPreprocessor',
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 9137b105..e20adaa6 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -29,6 +29,7 @@ __all__ = [
     'PairSentenceClassificationPreprocessor',
     'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
     'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
+    'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
     'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor',
     'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor',
     'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
@@ -100,6 +101,7 @@ class SequenceClassificationPreprocessor(Preprocessor):
 
         text_a = new_data[self.first_sequence]
         text_b = new_data.get(self.second_sequence, None)
+
         feature = self.tokenizer(
             text_a,
             text_b,
@@ -111,7 +113,6 @@ class SequenceClassificationPreprocessor(Preprocessor):
         rst['input_ids'].append(feature['input_ids'])
         rst['attention_mask'].append(feature['attention_mask'])
         rst['token_type_ids'].append(feature['token_type_ids'])
-
         return rst
 
 
@@ -268,6 +269,62 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
                 output[OutputKeys.LABELS] = labels
 
 
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.passage_ranking)
+class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in passage ranking model.
+    """
+
+    def __init__(self,
+                 model_dir: str,
+                 mode=ModeKeys.INFERENCE,
+                 *args,
+                 **kwargs):
+        """Load the tokenizer and read the input field names from kwargs.
+
+        Args:
+            model_dir (str): model path
+        """
+        super().__init__(model_dir, pair=True, mode=mode, *args, **kwargs)
+        self.model_dir: str = model_dir
+        self.first_sequence: str = kwargs.pop('first_sequence',
+                                              'source_sentence')
+        self.second_sequence = kwargs.pop('second_sequence',
+                                          'sentences_to_compare')
+        self.sequence_length = kwargs.pop('sequence_length', 128)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
+
+    @type_assert(object, (str, tuple, Dict))
+    def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]:
+        """Tokenize the query paired with each candidate passage."""
+        if isinstance(data, tuple):
+            sentence1, sentence2 = data
+        elif isinstance(data, dict):
+            sentence1 = data.get(self.first_sequence)
+            sentence2 = data.get(self.second_sequence)
+        else:
+            # a bare str passes type_assert but carries no passage list
+            raise TypeError('data must be a (query, passages) tuple or a dict')
+        if isinstance(sentence2, str):
+            sentence2 = [sentence2]
+        if isinstance(sentence1, str):
+            sentence1 = [sentence1]
+        # repeat the query once per passage so both inputs line up
+        sentence1 = sentence1 * len(sentence2)
+        feature = self.tokenizer(
+            sentence1,
+            sentence2,
+            padding='max_length',
+            truncation=True,
+            max_length=self.sequence_length,
+            return_tensors='pt')
+        # labels / qid can only be carried by dict inputs
+        for key in ('labels', 'qid'):
+            if isinstance(data, dict) and key in data:
+                feature[key] = data[key]
+        return feature
+
+
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.nli_tokenizer)
 @PREPROCESSORS.register_module(
@@ -298,6 +355,51 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
         super().__init__(model_dir, pair=False, mode=mode, **kwargs)
 
 
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.sentence_embedding)
+class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in sentence embedding.
+    """
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        # fill tokenizer defaults without overriding caller-supplied values
+        kwargs.setdefault('truncation', True)
+        kwargs.setdefault(
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
+        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
+        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
+
+    def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data Dict:
+                keys: "source_sentence" && "sentences_to_compare"
+                values: list of sentences
+                (``source_sentence`` may also be a single str)
+                Example:
+                    {"source_sentence": ["how long it take to get a master's degree"],
+                     "sentences_to_compare": ["On average, students take about 18 to 24 months
+                     to complete a master's degree.",
+                     "On the other hand, some students prefer to go at a slower pace
+                     and choose to take several years to complete their studies.",
+                     "It can take anywhere from two semesters"]}
+        Returns:
+            Dict[str, Any]: the preprocessed data
+        """
+        source_sentence = data['source_sentence']
+        if isinstance(source_sentence, str):
+            # a bare string would otherwise contribute only its first char
+            source_sentence = [source_sentence]
+        # only the first source sentence is used, followed by the candidates
+        sentences = [source_sentence[0], *data['sentences_to_compare']]
+        return self.tokenizer(
+            sentences,
+            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
+            padding=True,
+            truncation=True)
+
+
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
 class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase):
diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py
index 8f8938c8..a632642a 100644
--- a/modelscope/trainers/__init__.py
+++ b/modelscope/trainers/__init__.py
@@ -11,7 +11,7 @@ if TYPE_CHECKING:
                      ImagePortraitEnhancementTrainer,
                      MovieSceneSegmentationTrainer)
     from .multi_modal import CLIPTrainer
-    from .nlp import SequenceClassificationTrainer
+    from .nlp import SequenceClassificationTrainer, PassageRankingTrainer
     from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer
     from .trainer import EpochBasedTrainer
 
@@ -25,7 +25,7 @@ else:
             'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer'
         ],
         'multi_modal': ['CLIPTrainer'],
-        'nlp': ['SequenceClassificationTrainer'],
+        'nlp': ['SequenceClassificationTrainer', 'PassageRankingTrainer'],
         'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'],
         'trainer': ['EpochBasedTrainer']
     }
diff --git a/modelscope/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py
index 7ab8fd70..001cfefc 100644
--- a/modelscope/trainers/nlp/__init__.py
+++ b/modelscope/trainers/nlp/__init__.py
@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .sequence_classification_trainer import SequenceClassificationTrainer
     from .csanmt_translation_trainer import CsanmtTranslationTrainer
+    from .passage_ranking_trainer import PassageRankingTrainer
 else:
     _import_structure = {
         'sequence_classification_trainer': ['SequenceClassificationTrainer'],
         'csanmt_translation_trainer': ['CsanmtTranslationTrainer'],
+        'passage_ranking_trainer': ['PassageRankingTrainer']
     }
 
     import sys
diff --git a/modelscope/trainers/nlp/passage_ranking_trainer.py b/modelscope/trainers/nlp/passage_ranking_trainer.py
new file mode 100644
index 00000000..e54c2904
--- /dev/null
+++ b/modelscope/trainers/nlp/passage_ranking_trainer.py
@@ -0,0 +1,197 @@
+import time
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.utils.data import DataLoader, Dataset
+
+from modelscope.metainfo import Trainers
+from modelscope.models.base import Model, TorchModel
+from modelscope.msdatasets.ms_dataset import MsDataset
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.trainers.base import BaseTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer
+from modelscope.utils.constant import DEFAULT_MODEL_REVISION
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@dataclass
+class GroupCollator():
+    """Collate per-sample feature dicts into one batch dict.
+
+    A nested list of feature dicts is flattened first; the values stored
+    under each key are then concatenated along dim 0 with ``torch.cat``.
+    """
+
+    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+        if isinstance(features[0], list):
+            features = sum(features, [])
+        # the first sample fixes which keys the batch contains
+        grouped = {name: [] for name in features[0].keys()}
+        for sample in features:
+            for name, value in sample.items():
+                grouped[name].append(value)
+        # every value is assumed to be a tensor that torch.cat can join
+        return {k: torch.cat(v, dim=0) for k, v in grouped.items()}
+
+
+@TRAINERS.register_module(module_name=Trainers.nlp_passage_ranking_trainer)
+class PassageRankingTrainer(NlpEpochBasedTrainer):
+
+    def __init__(
+            self,
+            model: Optional[Union[TorchModel, nn.Module, str]] = None,
+            cfg_file: Optional[str] = None,
+            cfg_modify_fn: Optional[Callable] = None,
+            arg_parse_fn: Optional[Callable] = None,
+            data_collator: Optional[Callable] = None,
+            train_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            preprocessor: Optional[Preprocessor] = None,
+            optimizers: Tuple[torch.optim.Optimizer,
+                              torch.optim.lr_scheduler._LRScheduler] = (None,
+                                                                        None),
+            model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
+            **kwargs):
+        """Build the trainer; ``GroupCollator`` is the default collator."""
+        if data_collator is None:
+            data_collator = GroupCollator()
+
+        super().__init__(
+            model=model,
+            cfg_file=cfg_file,
+            cfg_modify_fn=cfg_modify_fn,
+            arg_parse_fn=arg_parse_fn,
+            data_collator=data_collator,
+            preprocessor=preprocessor,
+            optimizers=optimizers,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            model_revision=model_revision,
+            **kwargs)
+
+    def compute_mrr(self, result, k=10):
+        """Return MRR@k; ``result`` maps qid -> [(score, label), ...]."""
+        total = 0
+        for candidates in result.values():
+            ranked = sorted(candidates, key=lambda x: x[0], reverse=True)
+            # reciprocal rank of the first entry labelled '1' within the
+            # top k, or 0 when the top k holds no such entry
+            total += next(
+                (1.0 / rank for rank, ele in enumerate(ranked[:k], 1)
+                 if str(ele[1]) == '1'), 0)
+        return total / len(result)
+
+    def compute_ndcg(self, result, k=10):
+        """Return mean NDCG@k; ``result`` maps qid -> [(score, label)]."""
+        from sklearn.metrics import ndcg_score
+        ndcg = 0
+        for res in result.values():
+            sorted_res = sorted(res, key=lambda x: x[0], reverse=True)
+            labels = np.array([[ele[1] for ele in sorted_res]])
+            scores = np.array([[ele[0] for ele in sorted_res]])
+            ndcg += float(ndcg_score(labels, scores, k=k))
+        return ndcg / len(result)
+
+    def evaluate(self,
+                 checkpoint_path: Optional[str] = None,
+                 *args,
+                 **kwargs) -> Dict[str, float]:
+        """evaluate a dataset
+
+        evaluate `self.eval_dataset` with the model loaded from `checkpoint_path`;
+        when `checkpoint_path` is None the trainer's current model is used.
+
+        Args:
+            checkpoint_path (Optional[str], optional): the model path. Defaults to None.
+
+        Returns:
+            Dict[str, float]: one entry per configured metric name
+            Example:
+            {"mrr@10": 0.35, "ndcg@10": 0.41}
+        """
+        from modelscope.models.nlp import PassageRanking
+        # build the evaluation dataloader from the raw eval dataset
+        self.eval_dataloader = self._build_dataloader_with_dataset(
+            self.eval_dataset,
+            **self.cfg.evaluation.get('dataloader', {}),
+            collate_fn=self.eval_data_collator)
+        # evaluate the given checkpoint if there is one,
+        # otherwise the model currently held by the trainer
+        if checkpoint_path is not None:
+            model = PassageRanking.from_pretrained(checkpoint_path)
+        else:
+            model = self.model
+
+        # copy from easynlp (start)
+        model.eval()
+        total_samples = 0
+
+        logits_list = list()
+        label_list = list()
+        qid_list = list()
+
+        total_spent_time = 0.0
+        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+        model.to(device)
+        for _step, batch in enumerate(self.eval_dataloader):
+            try:
+                batch = {
+                    key:
+                    val.to(device) if isinstance(val, torch.Tensor) else val
+                    for key, val in batch.items()
+                }
+            except RuntimeError:
+                batch = {key: val for key, val in batch.items()}
+
+            infer_start_time = time.time()
+            with torch.no_grad():
+                label_ids = batch.pop('labels').detach().cpu().numpy()
+                qids = batch.pop('qid').detach().cpu().numpy()
+                outputs = model(batch)
+            infer_end_time = time.time()
+            total_spent_time += infer_end_time - infer_start_time
+            # use the real batch length: the last batch may be a partial one
+            total_samples += len(label_ids)
+            assert 'scores' in outputs
+            logits = outputs['scores']
+
+            label_list.extend(label_ids)
+            logits_list.extend(logits)
+            qid_list.extend(qids)
+
+        logger.info('Inference time = {:.2f}s, [{:.4f} ms / sample] '.format(
+            total_spent_time, total_spent_time * 1000 / total_samples))
+
+        rank_result = {}
+        for qid, score, label in zip(qid_list, logits_list, label_list):
+            if qid not in rank_result:
+                rank_result[qid] = []
+            rank_result[qid].append((score, label))
+
+        for qid in rank_result:
+            rank_result[qid] = sorted(rank_result[qid], key=lambda x: x[0])
+
+        eval_outputs = list()
+        for metric in self.metrics:
+            if metric.startswith('mrr'):
+                k = metric.split('@')[-1]
+                k = int(k)
+                mrr = self.compute_mrr(rank_result, k=k)
+                logger.info('{}: {}'.format(metric, mrr))
+                eval_outputs.append((metric, mrr))
+            elif metric.startswith('ndcg'):
+                k = int(metric.split('@')[-1])
+                ndcg = self.compute_ndcg(rank_result, k=k)
+                logger.info('{}: {}'.format(metric, ndcg))
+                # key by the full metric name so several cut-offs can coexist
+                eval_outputs.append((metric, ndcg))
+            else:
+                raise NotImplementedError('Metric %s not implemented' % metric)
+
+        return dict(eval_outputs)
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index 63a231b3..8dc75a65 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -345,12 +345,12 @@ class EpochBasedTrainer(BaseTrainer):
                     type=self.cfg.task, mode=mode, datasets=datasets)
                 return build_task_dataset(cfg, self.cfg.task)
             else:
-                task_data_config.update(
-                    dict(
-                        mode=mode,
-                        datasets=datasets,
-                        preprocessor=preprocessor))
-                return build_task_dataset(task_data_config, self.cfg.task)
+                # avoid adding non-str values (datasets, preprocessor) to cfg
+                task_data_build_config = ConfigDict(
+                    mode=mode, datasets=datasets, preprocessor=preprocessor)
+                task_data_build_config.update(task_data_config)
+                return build_task_dataset(task_data_build_config,
+                                          self.cfg.task)
         except Exception:
             if isinstance(datasets, (List, Tuple)) or preprocessor is not None:
                 return TorchTaskDataset(
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 6d84925c..57d38da7 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -89,6 +89,8 @@ class NLPTasks(object):
     sentiment_analysis = 'sentiment-analysis'
     sentence_similarity = 'sentence-similarity'
     text_classification = 'text-classification'
+    sentence_embedding = 'sentence-embedding'
+    passage_ranking = 'passage-ranking'
     relation_extraction = 'relation-extraction'
     zero_shot = 'zero-shot'
     translation = 'translation'
diff --git a/tests/pipelines/test_passage_ranking.py b/tests/pipelines/test_passage_ranking.py
new file mode 100644
index 00000000..5faa365e
--- /dev/null
+++ b/tests/pipelines/test_passage_ranking.py
@@ -0,0 +1,61 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import PassageRanking
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import PassageRankingPipeline
+from modelscope.preprocessors import PassageRankingPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class PassageRankingTest(unittest.TestCase):
+    model_id = 'damo/nlp_corom_passage-ranking_english-base'
+    inputs = {  # one query and the passages it is compared against
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': [
+            "On average, students take about 18 to 24 months to complete a master's degree.",
+            'On the other hand, some students prefer to go at a slower pace and choose to take '
+            'several years to complete their studies.',
+            'It can take anywhere from two semesters'
+        ]
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)  # used as model dir
+        tokenizer = PassageRankingPreprocessor(cache_path)
+        model = PassageRanking.from_pretrained(cache_path)
+        pipeline1 = PassageRankingPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.passage_ranking, model=model, preprocessor=tokenizer)
+        print(f'sentence: {self.inputs}\n'
+              f'pipeline1:{pipeline1(input=self.inputs)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.inputs)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = PassageRankingPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.passage_ranking, model=model, preprocessor=tokenizer)
+        print(pipeline_ins(input=self.inputs))  # output is printed only
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.passage_ranking, model=self.model_id)
+        print(pipeline_ins(input=self.inputs))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.passage_ranking)  # no model given
+        print(pipeline_ins(input=self.inputs))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py
new file mode 100644
index 00000000..739dd7ab
--- /dev/null
+++ b/tests/pipelines/test_sentence_embedding.py
@@ -0,0 +1,82 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import SentenceEmbedding
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import SentenceEmbeddingPipeline
+from modelscope.preprocessors import SentenceEmbeddingPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class SentenceEmbeddingTest(unittest.TestCase):
+    model_id = 'damo/nlp_corom_sentence-embedding_english-base'
+    inputs = {
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': [  # 3 sentences; the 2nd spans two literals
+            "On average, students take about 18 to 24 months to complete a master's degree.",
+            'On the other hand, some students prefer to go at a slower pace and choose to take '
+            'several years to complete their studies.',
+            'It can take anywhere from two semesters'
+        ]
+    }
+
+    inputs2 = {  # a single sentence to compare
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': [
+            "On average, students take about 18 to 24 months to complete a master's degree."
+        ]
+    }
+
+    inputs3 = {  # nothing to compare against
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': []
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = SentenceEmbeddingPreprocessor(cache_path)
+        model = SentenceEmbedding.from_pretrained(cache_path)
+        pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.sentence_embedding, model=model, preprocessor=tokenizer)
+        print(f'inputs: {self.inputs}\n'
+              f'pipeline1:{pipeline1(input=self.inputs)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.inputs)}')
+        print()
+        print(f'inputs: {self.inputs2}\n'
+              f'pipeline1:{pipeline1(input=self.inputs2)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.inputs2)}')
+        print(f'inputs: {self.inputs3}\n'
+              f'pipeline1:{pipeline1(input=self.inputs3)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.inputs3)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = SentenceEmbeddingPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_embedding, model=model, preprocessor=tokenizer)
+        print(pipeline_ins(input=self.inputs))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_embedding, model=self.model_id)
+        print(pipeline_ins(input=self.inputs))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.sentence_embedding)
+        print(pipeline_ins(input=self.inputs))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_finetune_passage_ranking.py b/tests/trainers/test_finetune_passage_ranking.py
new file mode 100644
index 00000000..f833f981
--- /dev/null
+++ b/tests/trainers/test_finetune_passage_ranking.py
@@ -0,0 +1,133 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
+
+import torch
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+from modelscope.metainfo import Trainers
+from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
+from modelscope.pipelines import pipeline
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+class TestFinetuneSequenceClassification(unittest.TestCase):
+    inputs = {
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': [
+            "On average, students take about 18 to 24 months to complete a master's degree.",
+            'On the other hand, some students prefer to go at a slower pace and choose to take '
+            'several years to complete their studies.',
+            'It can take anywhere from two semesters'
+        ]
+    }
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        # mkdtemp() creates the directory itself and never removes it
+        # implicitly; tearDown() deletes it explicitly
+        self.tmp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    def finetune(self,
+                 model_id,
+                 train_dataset,
+                 eval_dataset,
+                 name=Trainers.nlp_passage_ranking_trainer,
+                 cfg_modify_fn=None,
+                 **kwargs):
+        kwargs = dict(
+            model=model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            **kwargs)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer = build_trainer(name=name, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+    def test_finetune_msmarco(self):
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'passage-ranking'
+            cfg['preprocessor'] = {'type': 'passage-ranking'}
+            cfg.train.optimizer.lr = 2e-5
+            cfg['dataset'] = {
+                'train': {
+                    'type': 'bert',
+                    'query_sequence': 'query',
+                    'pos_sequence': 'positive_passages',
+                    'neg_sequence': 'negative_passages',
+                    'passage_text_fileds': ['title', 'text'],
+                    'qid_field': 'query_id'
+                },
+                'val': {
+                    'type': 'bert',
+                    'query_sequence': 'query',
+                    'pos_sequence': 'positive_passages',
+                    'neg_sequence': 'negative_passages',
+                    'passage_text_fileds': ['title', 'text'],
+                    'qid_field': 'query_id'
+                },
+            }
+            cfg['train']['neg_samples'] = 4
+            cfg['evaluation']['dataloader']['batch_size_per_gpu'] = 30
+            cfg.train.max_epochs = 1
+            cfg.train.train_batch_size = 4
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 3000
+            }]
+            return cfg
+
+        # load dataset
+        ds = MsDataset.load('passage-ranking-demo', 'zyznull')
+        train_ds = ds['train'].to_hf_dataset()
+        dev_ds = ds['train'].to_hf_dataset()  # NOTE: eval reuses 'train'
+
+        self.finetune(
+            model_id='damo/nlp_corom_passage-ranking_english-base',
+            train_dataset=train_ds,
+            eval_dataset=dev_ds,
+            cfg_modify_fn=cfg_modify_fn)
+
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        self.pipeline_passage_ranking(output_dir)
+
+    def pipeline_passage_ranking(self, model_dir):
+        model = Model.from_pretrained(model_dir)
+        pipeline_ins = pipeline(task=Tasks.passage_ranking, model=model)
+        print(pipeline_ins(input=self.inputs))
+
+
+if __name__ == '__main__':
+    unittest.main()

From 269faa8bce8b36d602ab3c0190eaff9164a47301 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Tue, 13 Sep 2022 10:37:59 +0800
Subject: [PATCH 093/175] [to #43878396] bump version to 0.4.0

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index d93912ee..abeeedbf 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.3.7'
+__version__ = '0.4.0'

From c35f8cb42b73a92935460b2639e189c67c2d11d4 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Tue, 13 Sep 2022 16:09:35 +0800
Subject: [PATCH 094/175] [to #42322933] remove deepspeed and fairseq from
 requirements

---
 modelscope/utils/error.py        | 15 +++++++++++++++
 modelscope/utils/import_utils.py |  2 ++
 requirements/multi-modal.txt     |  1 -
 requirements/nlp.txt             |  2 --
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py
index e7d1442f..a6bbc8b3 100644
--- a/modelscope/utils/error.py
+++ b/modelscope/utils/error.py
@@ -96,3 +96,18 @@ DECORD_IMPORT_ERROR = """
 {0} requires the decord library but it was not found in your environment. You can install it with pip:
 `pip install decord>=0.6.0`
 """
+
+# docstyle-ignore
+DEEPSPEED_IMPORT_ERROR = """
+{0} requires the Deepspeed library but it was not found in your environment. Checkout the instructions on the
+installation page: https://www.deepspeed.ai/tutorials/advanced-install/ and follow the ones that match your environment.
+"""
+
+# docstyle-ignore
+FAIRSEQ_IMPORT_ERROR = """
+{0} requires the fairseq library but it was not found in your environment.
+You can install it with pip on linux:
+`pip install fairseq`
+On windows, please checkout the instructions on the
+installation page: https://github.com/facebookresearch/fairseq and follow the ones that match your environment.
+"""
diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py
index c9bea020..2a6fdc80 100644
--- a/modelscope/utils/import_utils.py
+++ b/modelscope/utils/import_utils.py
@@ -290,6 +290,8 @@ REQUIREMENTS_MAAPING = OrderedDict([
     ('easyasr', (is_package_available('easyasr'), AUDIO_IMPORT_ERROR)),
     ('kwsbp', (is_package_available('kwsbp'), AUDIO_IMPORT_ERROR)),
     ('decord', (is_package_available('decord'), DECORD_IMPORT_ERROR)),
+    ('deepspeed', (is_package_available('deepspeed'), DEEPSPEED_IMPORT_ERROR)),
+    ('fairseq', (is_package_available('fairseq'), FAIRSEQ_IMPORT_ERROR)),
 ])
 
 SYSTEM_PACKAGE = set(['os', 'sys', 'typing'])
diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt
index ef5d4341..02e87baa 100644
--- a/requirements/multi-modal.txt
+++ b/requirements/multi-modal.txt
@@ -1,4 +1,3 @@
-fairseq
 ftfy>=6.0.3
 ofa>=0.0.2
 pycocoevalcap>=1.2
diff --git a/requirements/nlp.txt b/requirements/nlp.txt
index cf0468bb..15f2f41a 100644
--- a/requirements/nlp.txt
+++ b/requirements/nlp.txt
@@ -1,6 +1,4 @@
-deepspeed
 en_core_web_sm>=2.3.5
-fairseq>=0.10.2
 jieba>=0.42.1
 megatron_util
 pai-easynlp

From e9eeb05bcd8ea65f640157308026fe3c7db64dd4 Mon Sep 17 00:00:00 2001
From: "yingda.chen" <yingda.chen@alibaba-inc.com>
Date: Tue, 13 Sep 2022 18:04:01 +0800
Subject: [PATCH 095/175] [to #42322933] specify ast scan file open encoding   
      Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10090797

    * [to #42322933] specify ast scan file open encoding
---
 modelscope/utils/ast_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py
index 263a81b3..62c31397 100644
--- a/modelscope/utils/ast_utils.py
+++ b/modelscope/utils/ast_utils.py
@@ -394,7 +394,7 @@ class AstScaning(object):
 
     def generate_ast(self, file):
         self._refresh()
-        with open(file, 'r') as code:
+        with open(file, 'r', encoding='utf8') as code:
             data = code.readlines()
         data = ''.join(data)
 

From 06787d66d8c9e98740c0570abf2fcb923a9d2b3f Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Tue, 13 Sep 2022 18:48:55 +0800
Subject: [PATCH 096/175] [to #42322933] fix forward input in token
 classification         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10103632

---
 modelscope/pipelines/nlp/word_segmentation_pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
index 9899243e..7e8b22bc 100644
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -62,7 +62,7 @@ class WordSegmentationPipeline(Pipeline):
         text = inputs.pop(OutputKeys.TEXT)
         with torch.no_grad():
             return {
-                **self.model(inputs, **forward_params), OutputKeys.TEXT: text
+                **self.model(**inputs, **forward_params), OutputKeys.TEXT: text
             }
 
     def postprocess(self, inputs: Dict[str, Any],

From 3664805d9893dfa4c21f0a5e7bcc33f27e296b23 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Tue, 13 Sep 2022 20:23:35 +0800
Subject: [PATCH 097/175] [to #43878347] remove automatically model placement
 which will result in full memory usage

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10105641
---
 modelscope/models/base/base_model.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py
index 8744ce1c..cdc71fcf 100644
--- a/modelscope/models/base/base_model.py
+++ b/modelscope/models/base/base_model.py
@@ -91,7 +91,6 @@ class Model(ABC):
                 osp.join(local_model_dir, ModelFile.CONFIGURATION))
         task_name = cfg.task
         model_cfg = cfg.model
-        framework = cfg.framework
 
         if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
             model_cfg.type = model_cfg.model_type
@@ -101,9 +100,8 @@ class Model(ABC):
             model_cfg[k] = v
         if device is not None:
             model_cfg.device = device
-            with device_placement(framework, device):
-                model = build_model(
-                    model_cfg, task_name=task_name, default_args=kwargs)
+            model = build_model(
+                model_cfg, task_name=task_name, default_args=kwargs)
         else:
             model = build_model(
                 model_cfg, task_name=task_name, default_args=kwargs)

From ff58300d09c5aeb3d0d0d3c6553e8d7ad70b57df Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Wed, 14 Sep 2022 06:44:04 +0800
Subject: [PATCH 098/175] [to #44857956]fix: disable git command
 username/password prompt         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10106602

    * [to #44857956]fix: disable git command username/password prompt
---
 modelscope/hub/git.py                    |  9 ++++++++-
 tests/hub/test_hub_private_repository.py | 15 ++++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py
index 08eec3ff..264cd59a 100644
--- a/modelscope/hub/git.py
+++ b/modelscope/hub/git.py
@@ -39,14 +39,21 @@ class GitCommandWrapper(metaclass=Singleton):
             subprocess.CompletedProcess: the command response
         """
         logger.debug(' '.join(args))
+        git_env = os.environ.copy()
+        git_env['GIT_TERMINAL_PROMPT'] = '0'
         response = subprocess.run(
             [self.git_path, *args],
             stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)  # compatible for python3.6
+            stderr=subprocess.PIPE,
+            env=git_env,
+        )  # compatible for python3.6
         try:
             response.check_returncode()
             return response
         except subprocess.CalledProcessError as error:
+            logger.error(
+                'There are error run git command, you may need to login first.'
+            )
             raise GitError(
                 'stdout: %s, stderr: %s' %
                 (response.stdout.decode('utf8'), error.stderr.decode('utf8')))
diff --git a/tests/hub/test_hub_private_repository.py b/tests/hub/test_hub_private_repository.py
index 8683a884..dab2b891 100644
--- a/tests/hub/test_hub_private_repository.py
+++ b/tests/hub/test_hub_private_repository.py
@@ -10,7 +10,8 @@ from modelscope.hub.errors import GitError
 from modelscope.hub.repository import Repository
 from modelscope.utils.constant import ModelFile
 from .test_utils import (TEST_ACCESS_TOKEN1, TEST_ACCESS_TOKEN2,
-                         TEST_MODEL_CHINESE_NAME, TEST_MODEL_ORG)
+                         TEST_MODEL_CHINESE_NAME, TEST_MODEL_ORG,
+                         delete_credential)
 
 DEFAULT_GIT_PATH = 'git'
 
@@ -65,6 +66,18 @@ class HubPrivateRepositoryTest(unittest.TestCase):
         print(repo2.model_dir)
         assert repo1.model_dir == repo2.model_dir
 
+    def test_clone_private_model_without_token(self):
+        delete_credential()
+        temporary_dir = tempfile.mkdtemp()
+        local_dir = os.path.join(temporary_dir, self.model_name)
+        with self.assertRaises(GitError) as cm:
+            Repository(local_dir, clone_from=self.model_id)
+
+        print(cm.exception)
+        assert not os.path.exists(os.path.join(local_dir, ModelFile.README))
+
+        self.api.login(TEST_ACCESS_TOKEN1)  # re-login for delete
+
 
 if __name__ == '__main__':
     unittest.main()

From 77cfcf0a9acbbfb5f122a65cb4ce235944596146 Mon Sep 17 00:00:00 2001
From: "caorongyu.cry" <caorongyu.cry@alibaba-inc.com>
Date: Wed, 14 Sep 2022 19:04:56 +0800
Subject: [PATCH 099/175] [to #42322933] commit nlp_convai_text2sql_pretrain_cn
 inference process to modelscope

commit nlp_convai_text2sql_pretrain_cn inference process to modelscope
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10025155
---
 modelscope/metainfo.py                        |    3 +
 modelscope/models/nlp/__init__.py             |    2 +
 modelscope/models/nlp/star3/__init__.py       |    0
 .../models/nlp/star3/configuration_star3.py   |  128 +++
 modelscope/models/nlp/star3/modeling_star3.py | 1023 +++++++++++++++++
 .../models/nlp/table_question_answering.py    |  747 ++++++++++++
 modelscope/outputs.py                         |    8 +
 modelscope/pipelines/builder.py               |    3 +
 modelscope/pipelines/nlp/__init__.py          |    3 +
 .../nlp/table_question_answering_pipeline.py  |  284 +++++
 modelscope/preprocessors/__init__.py          |    2 +
 modelscope/preprocessors/star3/__init__.py    |   24 +
 .../preprocessors/star3/fields/__init__.py    |    0
 .../preprocessors/star3/fields/database.py    |   77 ++
 .../preprocessors/star3/fields/schema_link.py |  423 +++++++
 .../preprocessors/star3/fields/struct.py      |  181 +++
 .../table_question_answering_preprocessor.py  |  118 ++
 modelscope/utils/nlp/nlp_utils.py             |   17 +-
 .../test_table_question_answering.py          |   76 ++
 19 files changed, 3118 insertions(+), 1 deletion(-)
 create mode 100644 modelscope/models/nlp/star3/__init__.py
 create mode 100644 modelscope/models/nlp/star3/configuration_star3.py
 create mode 100644 modelscope/models/nlp/star3/modeling_star3.py
 create mode 100644 modelscope/models/nlp/table_question_answering.py
 create mode 100644 modelscope/pipelines/nlp/table_question_answering_pipeline.py
 create mode 100644 modelscope/preprocessors/star3/__init__.py
 create mode 100644 modelscope/preprocessors/star3/fields/__init__.py
 create mode 100644 modelscope/preprocessors/star3/fields/database.py
 create mode 100644 modelscope/preprocessors/star3/fields/schema_link.py
 create mode 100644 modelscope/preprocessors/star3/fields/struct.py
 create mode 100644 modelscope/preprocessors/star3/table_question_answering_preprocessor.py
 create mode 100644 tests/pipelines/test_table_question_answering.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index e5c3873b..80a522b2 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -55,6 +55,7 @@ class Models(object):
     space_intent = 'space-intent'
     space_modeling = 'space-modeling'
     star = 'star'
+    star3 = 'star3'
     tcrf = 'transformer-crf'
     transformer_softmax = 'transformer-softmax'
     lcrf = 'lstm-crf'
@@ -193,6 +194,7 @@ class Pipelines(object):
     plug_generation = 'plug-generation'
     faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    table_question_answering_pipeline = 'table-question-answering-pipeline'
     sentence_embedding = 'sentence-embedding'
     passage_ranking = 'passage-ranking'
     relation_extraction = 'relation-extraction'
@@ -296,6 +298,7 @@ class Preprocessors(object):
     fill_mask_ponet = 'fill-mask-ponet'
     faq_question_answering_preprocessor = 'faq-question-answering-preprocessor'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    table_question_answering_preprocessor = 'table-question-answering-preprocessor'
     re_tokenizer = 're-tokenizer'
     document_segmentation = 'document-segmentation'
 
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index d411f1fb..443cb214 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -24,6 +24,7 @@ if TYPE_CHECKING:
     from .space import SpaceForDialogIntent
     from .space import SpaceForDialogModeling
     from .space import SpaceForDialogStateTracking
+    from .table_question_answering import TableQuestionAnswering
     from .task_models import (InformationExtractionModel,
                               SequenceClassificationModel,
                               SingleBackboneTaskModelBase,
@@ -64,6 +65,7 @@ else:
             'SingleBackboneTaskModelBase', 'TokenClassificationModel'
         ],
         'token_classification': ['SbertForTokenClassification'],
+        'table_question_answering': ['TableQuestionAnswering'],
         'sentence_embedding': ['SentenceEmbedding'],
         'passage_ranking': ['PassageRanking'],
     }
diff --git a/modelscope/models/nlp/star3/__init__.py b/modelscope/models/nlp/star3/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/nlp/star3/configuration_star3.py b/modelscope/models/nlp/star3/configuration_star3.py
new file mode 100644
index 00000000..d49c70c9
--- /dev/null
+++ b/modelscope/models/nlp/star3/configuration_star3.py
@@ -0,0 +1,128 @@
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT configuration."""
+
+from __future__ import absolute_import, division, print_function
+import copy
+import logging
+import math
+import os
+import shutil
+import tarfile
+import tempfile
+from pathlib import Path
+from typing import Union
+
+import json
+import numpy as np
+import torch
+import torch_scatter
+from icecream import ic
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+logger = logging.getLogger(__name__)
+
+
+class Star3Config(object):
+    """Configuration class to store the configuration of a `Star3Model`.
+    """
+
+    def __init__(self,
+                 vocab_size_or_config_json_file,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02):
+        """Constructs Star3Config.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size (int) or path to a JSON config file (str).
+            hidden_size: Size of the encoder layers and the pooler layer.
+            num_hidden_layers: Number of hidden layers in the Transformer encoder.
+            num_attention_heads: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            hidden_act: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            hidden_dropout_prob: The dropout probability for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into `Star3Model`.
+            initializer_range: The stddev of the truncated_normal_initializer for
+                initializing all weight matrices.
+        """
+        if isinstance(vocab_size_or_config_json_file, str):
+            with open(
+                    vocab_size_or_config_json_file, 'r',
+                    encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.hidden_act = hidden_act
+            self.intermediate_size = intermediate_size
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.initializer_range = initializer_range
+        else:
+            raise ValueError(
+                'First argument must be either a vocabulary size (int) '
+                'or the path to a pretrained model config file (str)')
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `Star3Config` from a Python dictionary of parameters."""
+        config = Star3Config(vocab_size_or_config_json_file=-1)
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `Star3Config` from a json file of parameters."""
+        with open(json_file, 'r', encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n'
diff --git a/modelscope/models/nlp/star3/modeling_star3.py b/modelscope/models/nlp/star3/modeling_star3.py
new file mode 100644
index 00000000..ed5ea1b3
--- /dev/null
+++ b/modelscope/models/nlp/star3/modeling_star3.py
@@ -0,0 +1,1023 @@
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model."""
+
+from __future__ import absolute_import, division, print_function
+import copy
+import logging
+import math
+import os
+import shutil
+import tarfile
+import tempfile
+from pathlib import Path
+from typing import Union
+
+import json
+import numpy as np
+import torch
+import torch_scatter
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from modelscope.models.nlp.star3.configuration_star3 import Star3Config
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+CONFIG_NAME = ModelFile.CONFIGURATION
+WEIGHTS_NAME = ModelFile.TORCH_MODEL_BIN_FILE
+
+
+def gelu(x):
+    """Exact (erf-based) gelu activation: x * 0.5 * (1 + erf(x / sqrt(2))).
+        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    """
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def swish(x):
+    return x * torch.sigmoid(x)
+
+
+ACT2FN = {'gelu': gelu, 'relu': torch.nn.functional.relu, 'swish': swish}
+
+
+class BertLayerNorm(nn.Module):
+
+    def __init__(self, hidden_size, eps=1e-12):
+        """TF-style layer normalization over the last dimension; epsilon is
+        added to the variance inside the square root."""
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.bias = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x):
+        mean = x.mean(-1, keepdim=True)
+        variance = (x - mean).pow(2).mean(-1, keepdim=True)
+        normed = (x - mean) / torch.sqrt(variance + self.variance_epsilon)
+        return self.weight * normed + self.bias
+
+
+class BertEmbeddings(nn.Module):
+    """Embeddings from word/header, position, token_type, match_type and type lookups.
+    """
+
+    def __init__(self, config):
+        super(BertEmbeddings, self).__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size,
+                                            config.hidden_size)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
+                                                config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  config.hidden_size)
+        self.match_type_embeddings = nn.Embedding(11, config.hidden_size)
+        self.type_embeddings = nn.Embedding(6, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self,
+                input_ids,
+                header_ids,
+                token_type_ids=None,
+                match_type_ids=None,
+                l_hs=None,
+                header_len=None,
+                type_idx=None,
+                col_dict_list=None,
+                ids=None,
+                header_flatten_tokens=None,
+                header_flatten_index=None,
+                header_flatten_output=None,
+                token_column_id=None,
+                token_column_mask=None,
+                column_start_index=None,
+                headers_length=None):
+        seq_length = input_ids.size(1)
+        position_ids = torch.arange(
+            seq_length, dtype=torch.long, device=input_ids.device)
+        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+        words_embeddings = self.word_embeddings(input_ids)
+        # header_ids is not embedded: header vectors come from the pooling below
+
+        # header mean pooling: average the word embeddings of each header's tokens
+        header_flatten_embeddings = self.word_embeddings(header_flatten_tokens)
+        header_flatten_index = header_flatten_index.reshape(
+            (-1, header_flatten_index.shape[1], 1))
+        header_flatten_index = header_flatten_index.repeat(
+            1, 1, header_flatten_embeddings.shape[2])
+        header_flatten_output = header_flatten_output.reshape(
+            (-1, header_flatten_output.shape[1], 1))
+        header_flatten_output = header_flatten_output.repeat(
+            1, 1, header_flatten_embeddings.shape[2])
+        header_embeddings = torch_scatter.scatter_mean(
+            header_flatten_embeddings,
+            header_flatten_index,
+            out=header_flatten_output,
+            dim=1)
+        token_column_id = token_column_id.reshape(
+            (-1, token_column_id.shape[1], 1))
+        token_column_id = token_column_id.repeat(
+            (1, 1, header_embeddings.shape[2]))
+        token_column_mask = token_column_mask.reshape(
+            (-1, token_column_mask.shape[1], 1))
+        token_column_mask = token_column_mask.repeat(
+            (1, 1, header_embeddings.shape[2]))
+        token_header_embeddings = torch.gather(header_embeddings, 1,
+                                               token_column_id)
+        words_embeddings = words_embeddings * (1.0 - token_column_mask) + \
+            token_header_embeddings * token_column_mask
+
+        position_embeddings = self.position_embeddings(position_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = words_embeddings + position_embeddings + token_type_embeddings
+
+        if match_type_ids is not None:
+            match_type_embeddings = self.match_type_embeddings(match_type_ids)
+            embeddings += match_type_embeddings
+
+        if type_idx is not None:
+            type_embeddings = self.type_embeddings(type_idx)
+            embeddings += type_embeddings
+
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class BertSelfAttention(nn.Module):
+
+    def __init__(self, config):
+        super(BertSelfAttention, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                'The hidden size (%d) is not a multiple of the number of attention '
+                'heads (%d)' %
+                (config.hidden_size, config.num_attention_heads))
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask, schema_link_matrix=None):
+        mixed_query_layer = self.query(hidden_states)
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_for_scores(mixed_key_layer)
+        value_layer = self.transpose_for_scores(mixed_value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(
+            self.attention_head_size)
+        # Apply the additive attention mask (precomputed for all layers in BertModel forward() function)
+        attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (
+            self.all_head_size, )
+        context_layer = context_layer.view(*new_context_layer_shape)
+        return context_layer
+
+
+class BertSelfAttentionWithRelationsRAT(nn.Module):
+    '''
+    Adapted from https://github.com/microsoft/rat-sql/blob/master/ratsql/models/transformer.py
+    '''
+
+    def __init__(self, config):
+        super(BertSelfAttentionWithRelationsRAT, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                'The hidden size (%d) is not a multiple of the number of attention '
+                'heads (%d)' %
+                (config.hidden_size, config.num_attention_heads))
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+        self.relation_k_emb = nn.Embedding(
+            7, config.hidden_size // config.num_attention_heads)
+        self.relation_v_emb = nn.Embedding(
+            7, config.hidden_size // config.num_attention_heads)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask, relation):
+        '''
+        relation is [batch, seq len, seq len]
+        '''
+        mixed_query_layer = self.query(
+            hidden_states)  # [batch, seq len, hidden dim]
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+
+        relation_k = self.relation_k_emb(
+            relation)  # [batch, seq len, seq len, head dim]
+        relation_v = self.relation_v_emb(
+            relation)  # [batch, seq len, seq len, head dim]
+
+        query_layer = self.transpose_for_scores(
+            mixed_query_layer)  # [batch, num attn heads, seq len, head dim]
+        key_layer = self.transpose_for_scores(
+            mixed_key_layer)  # [batch, num attn heads, seq len, head dim]
+        value_layer = self.transpose_for_scores(
+            mixed_value_layer)  # [batch, num attn heads, seq len, head dim]
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(
+            -1, -2))  # [batch, num attn heads, seq len, seq len]
+
+        # relation_k_t is [batch, seq len, head dim, seq len]
+        relation_k_t = relation_k.transpose(-2, -1)
+        # query_layer_t is [batch, seq len, num attn heads, head dim]
+        query_layer_t = query_layer.permute(0, 2, 1, 3)
+        # relation_attention_scores is [batch, seq len, num attn heads, seq len]
+        relation_attention_scores = torch.matmul(query_layer_t, relation_k_t)
+        # relation_attention_scores_t is [batch, num attn heads, seq len, seq len]
+        relation_attention_scores_t = relation_attention_scores.permute(
+            0, 2, 1, 3)
+
+        merged_attention_scores = (attention_scores
+                                   + relation_attention_scores_t) / math.sqrt(
+                                       self.attention_head_size)
+
+        # Apply the additive attention mask (precomputed for all layers in BertModel forward() function)
+        merged_attention_scores = merged_attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(merged_attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        # attention_probs is [batch, num attn heads, seq len, seq len]
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        # attention_probs_t is [batch, seq len, num attn heads, seq len]
+        attention_probs_t = attention_probs.permute(0, 2, 1, 3)
+
+        #   [batch, seq len, num attn heads, seq len]
+        # * [batch, seq len, seq len, head dim]
+        # = [batch, seq len, num attn heads, head dim]
+        context_relation = torch.matmul(attention_probs_t, relation_v)
+
+        # context_relation_t is [batch, num attn heads, seq len, head dim]
+        context_relation_t = context_relation.permute(0, 2, 1, 3)
+
+        merged_context_layer = context_layer + context_relation_t
+        merged_context_layer = merged_context_layer.permute(0, 2, 1,
+                                                            3).contiguous()
+        new_context_layer_shape = merged_context_layer.size()[:-2] + (
+            self.all_head_size, )
+        merged_context_layer = merged_context_layer.view(
+            *new_context_layer_shape)
+        return merged_context_layer
+
+
+class BertSelfAttentionWithRelationsTableformer(nn.Module):
+    # Self-attention whose scores get a learned per-head bias per relation id.
+    def __init__(self, config):
+        super(BertSelfAttentionWithRelationsTableformer, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                'The hidden size (%d) is not a multiple of the number of attention '
+                'heads (%d)' %
+                (config.hidden_size, config.num_attention_heads))
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.schema_link_embeddings = nn.Embedding(7, self.num_attention_heads)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask, relation):
+        '''Return the context layer; `relation` holds ids indexing the 7-row
+        schema_link_embeddings table (stated shape: [batch, seq len, seq len]).
+        '''
+        mixed_query_layer = self.query(
+            hidden_states)  # [batch, seq len, hidden dim]
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+
+        schema_link_embeddings = self.schema_link_embeddings(
+            relation)  # [batch, seq len, seq len, num attn heads]
+        schema_link_embeddings = schema_link_embeddings.permute(0, 3, 1, 2)
+
+        query_layer = self.transpose_for_scores(
+            mixed_query_layer)  # [batch, num attn heads, seq len, head dim]
+        key_layer = self.transpose_for_scores(
+            mixed_key_layer)  # [batch, num attn heads, seq len, head dim]
+        value_layer = self.transpose_for_scores(
+            mixed_value_layer)  # [batch, num attn heads, seq len, head dim]
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(
+            -1, -2))  # [batch, num attn heads, seq len, seq len]
+        attention_scores = attention_scores / math.sqrt(
+            self.attention_head_size)
+
+        merged_attention_scores = attention_scores + schema_link_embeddings
+
+        # Apply the attention mask (precomputed for all layers in BertModel forward() function)
+        merged_attention_scores = merged_attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(merged_attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        # attention_probs is [batch, num attn heads, seq len, seq len]
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (
+            self.all_head_size, )
+        context_layer = context_layer.view(*new_context_layer_shape)
+        return context_layer
+
+
+class BertSelfOutput(nn.Module):
+    # Dense projection, dropout, then BertLayerNorm over the residual sum.
+    def __init__(self, config):
+        super(BertSelfOutput, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertAttention(nn.Module):
+    """Self-attention chosen by `schema_link_module`, followed by BertSelfOutput."""
+
+    def __init__(self, config, schema_link_module='none'):
+        super(BertAttention, self).__init__()
+        # Unknown names raise KeyError here instead of leaving `self.self` unset.
+        self.self = {
+            'none': BertSelfAttention,
+            'rat': BertSelfAttentionWithRelationsRAT,
+            'add': BertSelfAttentionWithRelationsTableformer,
+        }[schema_link_module](config)
+        self.output = BertSelfOutput(config)
+
+    def forward(self, input_tensor, attention_mask, schema_link_matrix=None):
+        self_output = self.self(input_tensor, attention_mask,
+                                schema_link_matrix)
+        return self.output(self_output, input_tensor)
+
+
+class BertIntermediate(nn.Module):
+    # Dense layer to intermediate_size followed by the configured activation.
+    def __init__(self, config):
+        super(BertIntermediate, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.intermediate_act_fn = ACT2FN[config.hidden_act] \
+            if isinstance(config.hidden_act, str) else config.hidden_act
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class BertOutput(nn.Module):
+    # Projects back to hidden_size, applies dropout, then residual BertLayerNorm.
+    def __init__(self, config):
+        super(BertOutput, self).__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertLayer(nn.Module):
+    # One encoder layer: BertAttention -> BertIntermediate -> BertOutput.
+    def __init__(self, config, schema_link_module='none'):
+        super(BertLayer, self).__init__()
+        self.attention = BertAttention(
+            config, schema_link_module=schema_link_module)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+
+    def forward(self, hidden_states, attention_mask, schema_link_matrix=None):
+        attention_output = self.attention(hidden_states, attention_mask,
+                                          schema_link_matrix)
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class SqlBertEncoder(nn.Module):
+    # Stack of `layers` deep-copied BertLayer modules (default schema linking).
+    def __init__(self, layers, config):
+        super(SqlBertEncoder, self).__init__()
+        layer = BertLayer(config)
+        self.layer = nn.ModuleList(
+            [copy.deepcopy(layer) for _ in range(layers)])
+    # Always returns a list: every layer's output, or only the last one.
+    def forward(self,
+                hidden_states,
+                attention_mask,
+                output_all_encoded_layers=True):
+        all_encoder_layers = []
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states, attention_mask)
+            if output_all_encoded_layers:
+                all_encoder_layers.append(hidden_states)
+        if not output_all_encoded_layers:  # keep only the final layer
+            all_encoder_layers.append(hidden_states)
+        return all_encoder_layers
+
+
+class BertEncoder(nn.Module):
+    # Stack of config.num_hidden_layers deep-copied BertLayer modules.
+    def __init__(self, config, schema_link_module='none'):
+        super(BertEncoder, self).__init__()
+        layer = BertLayer(config, schema_link_module=schema_link_module)
+        self.layer = nn.ModuleList(
+            [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+    # NOTE(review): all_schema_link_mask is accepted but never used below.
+    def forward(self,
+                hidden_states,
+                attention_mask,
+                all_schema_link_matrix=None,
+                all_schema_link_mask=None,
+                output_all_encoded_layers=True):
+        all_encoder_layers = []
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states, attention_mask,
+                                         all_schema_link_matrix)
+            if output_all_encoded_layers:
+                all_encoder_layers.append(hidden_states)
+        if not output_all_encoded_layers:  # keep only the final layer
+            all_encoder_layers.append(hidden_states)
+        return all_encoder_layers
+
+
+class BertPooler(nn.Module):
+    # Dense + tanh over hidden_states[:, 0] (the first position).
+    def __init__(self, config):
+        super(BertPooler, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class BertPredictionHeadTransform(nn.Module):
+    # Dense layer, configured activation, then BertLayerNorm.
+    def __init__(self, config):
+        super(BertPredictionHeadTransform, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.transform_act_fn = ACT2FN[config.hidden_act] \
+            if isinstance(config.hidden_act, str) else config.hidden_act
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+class BertLMPredictionHead(nn.Module):
+    # LM head: transform, then a decoder sharing the given embedding weights.
+    def __init__(self, config, bert_model_embedding_weights):
+        super(BertLMPredictionHead, self).__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(
+            bert_model_embedding_weights.size(1),
+            bert_model_embedding_weights.size(0),
+            bias=False)
+        self.decoder.weight = bert_model_embedding_weights
+        self.bias = nn.Parameter(
+            torch.zeros(bert_model_embedding_weights.size(0)))
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states) + self.bias
+        return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+    # Thin wrapper that exposes only the LM prediction head.
+    def __init__(self, config, bert_model_embedding_weights):
+        super(BertOnlyMLMHead, self).__init__()
+        self.predictions = BertLMPredictionHead(config,
+                                                bert_model_embedding_weights)
+
+    def forward(self, sequence_output):
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+class BertOnlyNSPHead(nn.Module):
+    # Linear head producing 2 scores from the pooled output.
+    def __init__(self, config):
+        super(BertOnlyNSPHead, self).__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return seq_relationship_score
+
+
+class BertPreTrainingHeads(nn.Module):
+    # LM prediction head plus a 2-way linear head on the pooled output.
+    def __init__(self, config, bert_model_embedding_weights):
+        super(BertPreTrainingHeads, self).__init__()
+        self.predictions = BertLMPredictionHead(config,
+                                                bert_model_embedding_weights)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        prediction_scores = self.predictions(sequence_output)
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return prediction_scores, seq_relationship_score
+
+
+class PreTrainedBertModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for loading pretrained models from a local path.
+    """
+
+    def __init__(self, config, *inputs, **kwargs):
+        super(PreTrainedBertModel, self).__init__()
+        if not isinstance(config, Star3Config):
+            raise ValueError(
+                'Parameter config in `{}(config)` should be an instance of class `Star3Config`. '
+                'To create a model from a Google pretrained model use '
+                '`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`'.format(
+                    self.__class__.__name__, self.__class__.__name__))
+        self.config = config
+
+    def init_bert_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, BertLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    @classmethod
+    def from_pretrained(cls,
+                        pretrained_model_name,
+                        state_dict=None,
+                        cache_dir=None,
+                        *inputs,
+                        **kwargs):
+        """
+        Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict.
+        Nothing is downloaded or cached here: the name is used as a local path.
+
+        Params:
+            pretrained_model_name: either:
+                - a directory containing the files listed below, or
+                - a path to a gzip-compressed tar archive (opened with mode 'r:gz'),
+                  which is extracted to a temporary directory; that directory is
+                  removed before returning, but not when an exception is raised.
+                  NOTE(review): extractall() is called without validating member
+                  paths, so only trusted archives should be passed.
+                NOTE(review): shortcut names such as `bert-base-uncased` and urls are
+                not resolved by this method.
+                Files read from the directory / extracted archive:
+                    . CONFIG_NAME: a configuration file for the model
+                    . WEIGHTS_NAME: a PyTorch dump, read only when `state_dict` is None
+            cache_dir: NOTE(review): accepted but not used by this method.
+            state_dict: an optional state dictionary (collections.OrderedDict object)
+                to use instead of the weights file
+            *inputs, **kwargs: additional input for the specific Bert class
+                (ex: num_labels for BertForSequenceClassification)
+        """
+        resolved_archive_file = pretrained_model_name
+        # use the given name directly as a local path (no cache lookup is done)
+        tempdir = None
+        if os.path.isdir(resolved_archive_file):
+            serialization_dir = resolved_archive_file
+        else:
+            # Extract archive to temp dir
+            tempdir = tempfile.mkdtemp()
+            logger.info('extracting archive file {} to temp dir {}'.format(
+                resolved_archive_file, tempdir))
+            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+                archive.extractall(tempdir)
+            serialization_dir = tempdir
+        # Load config
+        config_file = os.path.join(serialization_dir, CONFIG_NAME)
+        config = Star3Config.from_json_file(config_file)
+        logger.info('Model config {}'.format(config))
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None:
+            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+            state_dict = torch.load(weights_path)  # no map_location given
+        # Rename state-dict keys: 'gamma' -> 'weight', 'beta' -> 'bias'.
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(
+                prefix[:-1], {})
+            module._load_from_state_dict(state_dict, prefix, local_metadata,
+                                         True, missing_keys, unexpected_keys,
+                                         error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+        # Models without a `bert` attribute load from keys prefixed 'bert.'.
+        load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
+        if len(missing_keys) > 0:
+            logger.info(
+                'Weights of {} not initialized from pretrained model: {}'.
+                format(model.__class__.__name__, missing_keys))
+            print()
+            print('*' * 10, 'WARNING missing weights', '*' * 10)
+            print('Weights of {} not initialized from pretrained model: {}'.
+                  format(model.__class__.__name__, missing_keys))
+            print()
+        if len(unexpected_keys) > 0:
+            logger.info(
+                'Weights from pretrained model not used in {}: {}'.format(
+                    model.__class__.__name__, unexpected_keys))
+            print()
+            print('*' * 10, 'WARNING unexpected weights', '*' * 10)
+            print('Weights from pretrained model not used in {}: {}'.format(
+                model.__class__.__name__, unexpected_keys))
+            print()
+        if tempdir:
+            # Clean up temp dir
+            shutil.rmtree(tempdir)
+        return model
+
+
+class Star3Model(PreTrainedBertModel):
+    """Star3Model model ("Bidirectional Embedding Representations from a Transformer pretrained on STAR3.0").
+
+    Params:
+        config: a Star3Config class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output
+            as described below. Default: `True`.
+
+    Outputs: Tuple of (encoded_layers, pooled_output)
+        `encoded_layers`: controlled by `output_all_encoded_layers` argument:
+            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
+                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                to the last attention block of shape [batch_size, sequence_length, hidden_size],
+        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+            classifier pretrained on top of the hidden state associated to the first character of the
+            input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
+    Example usage (NOTE(review): outdated -- forward() takes `header_ids` as its second
+    positional argument, so the positional call below no longer matches the signature):
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling.Star3Config(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = modeling.Star3Model(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+
+    def __init__(self, config, schema_link_module='none'):
+        super(Star3Model, self).__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(
+            config, schema_link_module=schema_link_module)
+        self.pooler = BertPooler(config)
+        self.apply(self.init_bert_weights)
+
+    def forward(self,
+                input_ids,
+                header_ids,
+                token_order_ids=None,
+                token_type_ids=None,
+                attention_mask=None,
+                match_type_ids=None,
+                l_hs=None,
+                header_len=None,
+                type_ids=None,
+                col_dict_list=None,
+                ids=None,
+                header_flatten_tokens=None,
+                header_flatten_index=None,
+                header_flatten_output=None,
+                token_column_id=None,
+                token_column_mask=None,
+                column_start_index=None,
+                headers_length=None,
+                all_schema_link_matrix=None,
+                all_schema_link_mask=None,
+                output_all_encoded_layers=True):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        # We create a 4D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+
+        # Bowen: comment out the following line for Pytorch >= 1.5
+        # https://github.com/huggingface/transformers/issues/3936#issuecomment-793764416
+        # extended_attention_mask = extended_attention_mask.to(self.dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        # NOTE(review): token_order_ids is accepted but never passed on below.
+        embedding_output = self.embeddings(
+            input_ids, header_ids, token_type_ids, match_type_ids, l_hs,
+            header_len, type_ids, col_dict_list, ids, header_flatten_tokens,
+            header_flatten_index, header_flatten_output, token_column_id,
+            token_column_mask, column_start_index, headers_length)
+        encoded_layers = self.encoder(
+            embedding_output,
+            extended_attention_mask,
+            all_schema_link_matrix=all_schema_link_matrix,
+            all_schema_link_mask=all_schema_link_mask,
+            output_all_encoded_layers=output_all_encoded_layers)
+        sequence_output = encoded_layers[-1]
+        pooled_output = self.pooler(sequence_output)
+        if not output_all_encoded_layers:
+            encoded_layers = encoded_layers[-1]
+        return encoded_layers, pooled_output
+
+
+class Seq2SQL(nn.Module):
+    # Linear heads applied to encoder states gathered at computed token indices.
+    def __init__(self, iS, hS, lS, dr, n_cond_ops, n_agg_ops, n_action_ops,
+                 max_select_num, max_where_num, device):
+        super(Seq2SQL, self).__init__()
+        self.iS = iS
+        self.hS = hS
+        self.ls = lS  # NOTE(review): lowercase `ls`, unlike iS/hS -- intended?
+        self.dr = dr
+        self.device = device
+
+        self.n_agg_ops = n_agg_ops
+        self.n_cond_ops = n_cond_ops
+        self.n_action_ops = n_action_ops
+        self.max_select_num = max_select_num
+        self.max_where_num = max_where_num
+
+        self.w_sss_model = nn.Linear(iS, max_where_num)
+        self.w_sse_model = nn.Linear(iS, max_where_num)
+        self.s_ht_model = nn.Linear(iS, max_select_num)
+        self.wc_ht_model = nn.Linear(iS, max_where_num)
+
+        self.select_agg_model = nn.Linear(iS * max_select_num,
+                                          n_agg_ops * max_select_num)
+        self.w_op_model = nn.Linear(iS * max_where_num,
+                                    n_cond_ops * max_where_num)
+
+        self.conn_model = nn.Linear(iS, 3)
+        self.action_model = nn.Linear(iS, n_action_ops + 1)
+        self.slen_model = nn.Linear(iS, max_select_num + 1)
+        self.wlen_model = nn.Linear(iS, max_where_num + 1)
+
+    def forward(self, wemb_layer, l_n, l_hs, start_index, column_index, tokens,
+                ids):
+        # chunk input lists for multi-gpu
+        max_l_n = max(l_n)  # maxima are taken before the `ids` selection
+        max_l_hs = max(l_hs)
+        l_n = np.array(l_n)[ids.cpu().numpy()].tolist()
+        l_hs = np.array(l_hs)[ids.cpu().numpy()].tolist()
+        start_index = np.array(start_index)[ids.cpu().numpy()].tolist()
+        column_index = np.array(column_index)[ids.cpu().numpy()].tolist()
+        # NOTE(review): `tokens` is accepted but not used in this method.
+
+        conn_index = []
+        slen_index = []
+        wlen_index = []
+        action_index = []
+        where_op_index = []
+        select_agg_index = []
+        header_pos_index = []
+        query_index = []
+        for ib, elem in enumerate(start_index):
+            # [SEP] conn [SEP] wlen [SEP] (wop [SEP])*wn slen [SEP] (agg [SEP])*sn
+            action_index.append(elem + 1)
+            conn_index.append(elem + 2)
+            wlen_index.append(elem + 3)
+            woi = [elem + 4 + i for i in range(self.max_where_num)]
+
+            slen_index.append(elem + 4 + self.max_where_num)
+            sai = [
+                elem + 5 + self.max_where_num + i
+                for i in range(self.max_select_num)
+            ]
+            where_op_index.append(woi)
+            select_agg_index.append(sai)
+            # query positions 0..l_n+1, padded by repeating position l_n+1
+            qilist = [i for i in range(l_n[ib] + 2)] + [l_n[ib] + 1] * (
+                max_l_n - l_n[ib])
+            query_index.append(qilist)
+            # pad with the first header position; assumes l_hs[ib] >= 1
+            index = [column_index[ib] + i for i in range(0, l_hs[ib], 1)]
+            index += [index[0] for _ in range(max_l_hs - len(index))]
+            header_pos_index.append(index)
+
+        # Convert the python index lists into long tensors on self.device
+        # so they can be used with index_select below.
+        conn_index = torch.tensor(conn_index, dtype=torch.long).to(self.device)
+        slen_index = torch.tensor(slen_index, dtype=torch.long).to(self.device)
+        wlen_index = torch.tensor(wlen_index, dtype=torch.long).to(self.device)
+        action_index = torch.tensor(
+            action_index, dtype=torch.long).to(self.device)
+        where_op_index = torch.tensor(
+            where_op_index, dtype=torch.long).to(self.device)
+        select_agg_index = torch.tensor(
+            select_agg_index, dtype=torch.long).to(self.device)
+        query_index = torch.tensor(
+            query_index, dtype=torch.long).to(self.device)
+        header_index = torch.tensor(
+            header_pos_index, dtype=torch.long).to(self.device)
+
+        bS = len(l_n)
+        conn_emb = torch.zeros([bS, self.iS]).to(self.device)
+        slen_emb = torch.zeros([bS, self.iS]).to(self.device)
+        wlen_emb = torch.zeros([bS, self.iS]).to(self.device)
+        action_emb = torch.zeros([bS, self.iS]).to(self.device)
+        wo_emb = torch.zeros([bS, self.max_where_num, self.iS]).to(self.device)
+        sa_emb = torch.zeros([bS, self.max_select_num,
+                              self.iS]).to(self.device)
+        qv_emb = torch.zeros([bS, max_l_n + 2, self.iS]).to(self.device)
+        ht_emb = torch.zeros([bS, max_l_hs, self.iS]).to(self.device)
+        for i in range(bS):
+            conn_emb[i, :] = wemb_layer[i].index_select(0, conn_index[i])
+            slen_emb[i, :] = wemb_layer[i].index_select(0, slen_index[i])
+            wlen_emb[i, :] = wemb_layer[i].index_select(0, wlen_index[i])
+            action_emb[i, :] = wemb_layer[i].index_select(0, action_index[i])
+
+            wo_emb[i, :, :] = wemb_layer[i].index_select(
+                0, where_op_index[i, :])
+            sa_emb[i, :, :] = wemb_layer[i].index_select(
+                0, select_agg_index[i, :])
+            qv_emb[i, :, :] = wemb_layer[i].index_select(0, query_index[i, :])
+            ht_emb[i, :, :] = wemb_layer[i].index_select(0, header_index[i, :])
+
+        s_cco = self.conn_model(conn_emb.reshape(-1, self.iS)).reshape(bS, 3)
+        s_slen = self.slen_model(slen_emb.reshape(-1, self.iS)).reshape(
+            bS, self.max_select_num + 1)
+        s_wlen = self.wlen_model(wlen_emb.reshape(-1, self.iS)).reshape(
+            bS, self.max_where_num + 1)
+        s_action = self.action_model(action_emb.reshape(-1, self.iS)).reshape(
+            bS, self.n_action_ops + 1)
+        wo_output = self.w_op_model(
+            wo_emb.reshape(-1, self.iS * self.max_where_num)).reshape(
+                bS, -1, self.n_cond_ops)
+
+        wc_output = self.wc_ht_model(ht_emb.reshape(-1, self.iS)).reshape(
+            bS, -1, self.max_where_num).transpose(1, 2)
+
+        wv_ss = self.w_sss_model(qv_emb.reshape(-1, self.iS)).reshape(
+            bS, -1, self.max_where_num).transpose(1, 2)
+        wv_se = self.w_sse_model(qv_emb.reshape(-1, self.iS)).reshape(
+            bS, -1, self.max_where_num).transpose(1, 2)
+
+        sc_output = self.s_ht_model(ht_emb.reshape(-1, self.iS)).reshape(
+            bS, -1, self.max_select_num).transpose(1, 2)
+        sa_output = self.select_agg_model(
+            sa_emb.reshape(-1, self.iS * self.max_select_num)).reshape(
+                bS, -1, self.n_agg_ops)
+
+        return s_action, sc_output, sa_output, s_cco, wc_output, wo_output, (
+            wv_ss, wv_se), (s_slen, s_wlen)
diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py
new file mode 100644
index 00000000..19fdf178
--- /dev/null
+++ b/modelscope/models/nlp/table_question_answering.py
@@ -0,0 +1,747 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+from typing import Dict, Optional
+
+import numpy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import BertTokenizer
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Model, Tensor
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.star3.configuration_star3 import Star3Config
+from modelscope.models.nlp.star3.modeling_star3 import Seq2SQL, Star3Model
+from modelscope.preprocessors.star3.fields.struct import Constant
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.device import verify_device
+
+__all__ = ['TableQuestionAnswering']
+
+
+@MODELS.register_module(
+    Tasks.table_question_answering, module_name=Models.star3)
+class TableQuestionAnswering(Model):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Load tokenizer, backbone and Seq2SQL head from `model_dir`.
+
+        Args:
+            model_dir (str): the model path.
+            device (str, in kwargs): device name, 'gpu' by default.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.tokenizer = BertTokenizer(
+            os.path.join(model_dir, ModelFile.VOCAB_FILE))
+        device_name = kwargs.get('device', 'gpu')
+        verify_device(device_name)
+        # torch has no 'gpu' device type: translate 'gpu[:X]' to 'cuda[:X]'
+        # and run on the CPU when CUDA is not available.
+        self._device_name = device_name.replace(
+            'gpu', 'cuda') if torch.cuda.is_available() else 'cpu'
+
+        state_dict = torch.load(
+            os.path.join(self.model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location='cpu')
+        self.backbone_config = Star3Config.from_json_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        self.backbone_model = Star3Model(
+            config=self.backbone_config, schema_link_module='rat')
+        self.backbone_model.load_state_dict(state_dict['backbone_model'])
+
+        constant = Constant()
+        self.agg_ops = constant.agg_ops
+        self.cond_ops = constant.cond_ops
+        self.cond_conn_ops = constant.cond_conn_ops
+        self.action_ops = constant.action_ops
+        self.max_select_num = constant.max_select_num
+        self.max_where_num = constant.max_where_num
+        self.col_type_dict = constant.col_type_dict
+        self.schema_link_dict = constant.schema_link_dict
+        self.head_model = Seq2SQL(
+            self.backbone_config.hidden_size, 100, 2, 0.0,
+            len(self.cond_ops), len(self.agg_ops), len(self.action_ops),
+            self.max_select_num, self.max_where_num, self._device_name)
+        # strict=False tolerates missing / unexpected keys in the checkpoint
+        self.head_model.load_state_dict(state_dict['head_model'], strict=False)
+        self.backbone_model.to(self._device_name)
+        self.head_model.to(self._device_name)
+
+    def convert_string(self, pr_wvi, nlu, nlu_tt):
+        """Map predicted token spans back to substrings of the raw questions.
+
+        `pr_wvi[b]` lists [start, end] token indices (end inclusive) into
+        `nlu_tt[b]`. Returns, per sample, one string per span: cut out of
+        `nlu[b]` when its tokens align with the characters, else token-joined.
+        """
+        convs = []
+        for b, nlu1 in enumerate(nlu):
+            conv_dict = {}  # token index -> [char_start, char_end) in nlu1
+            nlu_tt1 = nlu_tt[b]
+            idx = 0
+            convflag = True
+            for i, ntok in enumerate(nlu_tt1):
+                if idx >= len(nlu1):
+                    convflag = False
+                    break
+                if ntok.startswith('##'):
+                    ntok = ntok.replace('##', '')
+                tok = nlu1[idx:idx + 1].lower()
+                if ntok == tok:
+                    conv_dict[i] = [idx, idx + 1]
+                    idx += 1
+                elif ntok == '#':
+                    conv_dict[i] = [idx, idx]
+                elif ntok == '[UNK]':  # skip chars until the next token starts
+                    conv_dict[i] = [idx, idx + 1]
+                    j = i + 1
+                    idx += 1
+                    if idx < len(nlu1) and j < len(
+                            nlu_tt1) and nlu_tt1[j] != '[UNK]':
+                        while idx < len(nlu1):
+                            val = nlu1[idx:idx + 1].lower()
+                            if nlu_tt1[j].startswith(val):
+                                break
+                            idx += 1
+                        conv_dict[i][1] = idx
+                elif tok in ntok:  # grow the char window until it spells ntok
+                    startid = idx
+                    idx += 1
+                    while idx < len(nlu1):
+                        tok += nlu1[idx:idx + 1].lower()
+                        if ntok == tok:
+                            conv_dict[i] = [startid, idx + 1]
+                            break
+                        idx += 1
+                    idx += 1
+                else:
+                    convflag = False
+            # a failed alignment leaves no ranges: every span uses the fallback
+            conv_dict = conv_dict if convflag else {}
+            conv = []
+            for pr_wvi1 in pr_wvi[b]:
+                st, ed, newidx = pr_wvi1[0], pr_wvi1[1], pr_wvi1[1]
+                try:
+                    s1, (s2, e2) = conv_dict[st][0], conv_dict[newidx]
+                    while newidx + 1 < len(
+                            nlu_tt1) and s2 == e2 and nlu_tt1[newidx] == '#':
+                        newidx += 1
+                        s2, e2 = conv_dict[newidx]
+                    if newidx + 1 < len(nlu_tt1) and nlu_tt1[
+                            newidx + 1].startswith('##'):
+                        s2, e2 = conv_dict[newidx + 1]
+                    phrase = nlu1[s1:e2]
+                except KeyError:  # a token of the span has no char range
+                    phrase = ''.join(nlu_tt1[st:ed + 1]).replace('##', '')
+                conv.append(phrase)
+            convs.append(conv)
+
+        return convs
+
+    def get_fields_info(self, t1s, tables, train=True):
+        """Split the preprocessed samples into parallel per-field lists.
+
+        `tables` is not used; `action` and `sql_i` stay empty unless `train`.
+        """
+        nlu = [t1['question'] for t1 in t1s]
+        nlu_t = [t1['question_tok'] for t1 in t1s]
+        hs_t = [t1['header_tok'] for t1 in t1s]
+        q_know = [t1['bertindex_knowledge'] for t1 in t1s]
+        t_know = [t1['header_knowledge'] for t1 in t1s]
+        types = [t1['types'] for t1 in t1s]
+        units = [t1['units'] for t1 in t1s]
+        his_sql = [t1.get('history_sql', None) for t1 in t1s]
+        schema_link = [t1.get('schema_link', []) for t1 in t1s]
+        action = [t1.get('action', [0]) for t1 in t1s] if train else []
+        sql_i = [t1['sql'] for t1 in t1s] if train else []
+        return (nlu, nlu_t, sql_i, q_know, t_know, action, hs_t, types, units,
+                his_sql, schema_link)
+
+    def get_history_select_where(self, his_sql, header_len):
+        """Return the select / where column ids found in the history SQL.
+
+        Every column id is shifted by one, dropped when the shifted id is not
+        below `header_len`, de-duplicated and sorted. `[0]` is returned for a
+        clause left without columns, and for both when `his_sql` is None.
+        """
+        if his_sql is None:
+            return [0], [0]
+
+        def shifted_ids(columns):
+            kept = []
+            for column in columns:
+                if column + 1 < header_len and column + 1 not in kept:
+                    kept.append(column + 1)
+            if not kept:
+                kept.append(0)
+            kept.sort()
+            return kept
+
+        sel = shifted_ids(his_sql['sel'])
+        whe = shifted_ids([condi[0] for condi in his_sql['conds']])
+        return sel, whe
+
+    def get_types_ids(self, col_type):
+        """Ids of the first type key found in `col_type`, else of 'null'."""
+        matches = [ids for key, ids in self.col_type_dict.items()
+                   if key in col_type.lower()]
+        return matches[0] if matches else self.col_type_dict['null']
+
+    def generate_inputs(self, nlu1_tok, hs_t_1, type_t, unit_t, his_sql,
+                        q_know, t_know, s_link):
+        """Build the token sequence and its side inputs for one sample.
+
+        Layout: [CLS] question [SEP] (select [PAD].. where [PAD].. [SEP])
+        one [PAD] per column, [SEP], then the slot tokens `action`,
+        `connect`, `allen`, `act` x max_where_num, `size` and
+        `focus` x max_select_num. `tokens`, `orders`, `types`, `segment_ids`
+        and `matchs` are parallel lists with one entry per token.
+        """
+        tokens = []
+        orders = []
+        types = []
+        segment_ids = []
+        matchs = []
+        col_dict = {}
+        schema_tok = []
+        # [CLS] and the question tokens (segment 0)
+        tokens.append('[CLS]')
+        orders.append(0)
+        types.append(0)
+        i_st_nlu = len(tokens)
+        matchs.append(0)
+        segment_ids.append(0)
+        for idx, token in enumerate(nlu1_tok):
+            if q_know[idx] == 100:  # a knowledge id of 100 stops the scan
+                break
+            elif q_know[idx] >= 5:
+                matchs.append(1)
+            else:
+                matchs.append(q_know[idx] + 1)
+            tokens.append(token)
+            orders.append(0)
+            types.append(0)
+            segment_ids.append(0)
+        i_ed_nlu = len(tokens)
+
+        tokens.append('[SEP]')
+        orders.append(0)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(0)
+        # history block, skipped when both clauses come back as [0]
+        sel, whe = self.get_history_select_where(his_sql, len(hs_t_1))
+        if len(sel) == 1 and sel[0] == 0 \
+                and len(whe) == 1 and whe[0] == 0:
+            pass
+        else:
+            tokens.append('select')
+            orders.append(0)
+            types.append(0)
+            matchs.append(10)
+            segment_ids.append(0)
+            for seli in sel:
+                tokens.append('[PAD]')
+                orders.append(0)
+                types.append(0)
+                matchs.append(10)
+                segment_ids.append(0)
+                col_dict[len(tokens) - 1] = seli
+
+            tokens.append('where')
+            orders.append(0)
+            types.append(0)
+            matchs.append(10)
+            segment_ids.append(0)
+            for whei in whe:
+                tokens.append('[PAD]')
+                orders.append(0)
+                types.append(0)
+                matchs.append(10)
+                segment_ids.append(0)
+                col_dict[len(tokens) - 1] = whei
+
+            tokens.append('[SEP]')
+            orders.append(0)
+            types.append(0)
+            matchs.append(10)
+            segment_ids.append(0)
+        # one [PAD] per column; its header (+ unit) tokens go to schema_tok
+        column_start = len(tokens)
+        i_hds_f = []
+        header_flatten_tokens, header_flatten_index = [], []
+        for i, hds11 in enumerate(hs_t_1):
+            if len(unit_t[i]) == 1 and unit_t[i][0] == 'null':
+                temp_header_tokens = hds11
+            else:
+                temp_header_tokens = hds11 + unit_t[i]
+            schema_tok.append(temp_header_tokens)
+            header_flatten_tokens.extend(temp_header_tokens)
+            header_flatten_index.extend([i + 1] * len(temp_header_tokens))
+            i_st_hd_f = len(tokens)
+            tokens += ['[PAD]']
+            orders.append(0)
+            types.append(self.get_types_ids(type_t[i]))
+            i_ed_hd_f = len(tokens)
+            col_dict[len(tokens) - 1] = i
+            i_hds_f.append((i_st_hd_f, i_ed_hd_f))
+            if i == 0:
+                matchs.append(6)
+            else:
+                matchs.append(t_know[i - 1] + 6)
+            segment_ids.append(1)
+
+        tokens.append('[SEP]')
+        orders.append(0)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+        # start_ids is the position of the [SEP] appended just above, i.e.
+        # the token right before the slot tokens.
+        start_ids = len(tokens) - 1
+        # slot tokens; repeated slots are numbered 2, 3, ... in `orders`
+        tokens.append('action')  # action
+        orders.append(1)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+        tokens.append('connect')  # column
+        orders.append(1)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+
+        tokens.append('allen')  # select len
+        orders.append(1)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+
+        for x in range(self.max_where_num):
+            tokens.append('act')  # op
+            orders.append(2 + x)
+            types.append(0)
+            matchs.append(0)
+            segment_ids.append(1)
+
+        tokens.append('size')  # where len
+        orders.append(1)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+        for x in range(self.max_select_num):
+            tokens.append('focus')  # agg
+            orders.append(2 + x)
+            types.append(0)
+            matchs.append(0)
+            segment_ids.append(1)
+
+        i_nlu = (i_st_nlu, i_ed_nlu)
+        # schema links between question tokens (rows) and column tokens (cols)
+        schema_link_matrix = numpy.zeros((len(tokens), len(tokens)),
+                                         dtype='int32')
+        schema_link_mask = numpy.zeros((len(tokens), len(tokens)),
+                                       dtype='float32')
+        for relation in s_link:
+            if relation['label'] in ['col', 'val']:
+                [q_st, q_ed] = relation['question_index']
+                cid = max(0, relation['column_index'])
+                schema_link_matrix[
+                    i_st_nlu + q_st: i_st_nlu + q_ed + 1,
+                    column_start + cid + 1: column_start + cid + 1 + 1] = \
+                    self.schema_link_dict[relation['label'] + '_middle']
+                schema_link_matrix[
+                    i_st_nlu + q_st,
+                    column_start + cid + 1: column_start + cid + 1 + 1] = \
+                    self.schema_link_dict[relation['label'] + '_start']
+                schema_link_matrix[
+                    i_st_nlu + q_ed,
+                    column_start + cid + 1: column_start + cid + 1 + 1] = \
+                    self.schema_link_dict[relation['label'] + '_end']
+                schema_link_mask[i_st_nlu + q_st:i_st_nlu + q_ed + 1,
+                                 column_start + cid + 1:column_start + cid + 1
+                                 + 1] = 1.0
+
+        return tokens, orders, types, segment_ids, matchs, \
+            i_nlu, i_hds_f, start_ids, column_start, col_dict, schema_tok, \
+            header_flatten_tokens, header_flatten_index, schema_link_matrix, schema_link_mask
+
+    def gen_l_hpu(self, i_hds):
+        """Flatten the header spans of a batch into a list of span lengths.
+
+        Columns are treated as if they were a batch of natural language
+        utterances with batch-size = # of columns * # of batch_size, e.g.
+        i_hds = [[(17, 18), (19, 21)], [(22, 25)]] -> [1, 2, 3]
+        """
+        return [
+            span[1] - span[0]
+            for sample_spans in i_hds
+            for span in sample_spans
+        ]
+
+    def get_bert_output(self, model_bert, tokenizer, nlu_t, hs_t, col_types,
+                        units, his_sql, q_know, t_know, schema_link):
+        """
+        Build padded batch tensors for every sample and run `model_bert`.
+
+        INPUT
+        :param model_bert: callable backbone invoked on the batch tensors
+        :param tokenizer: provides `convert_tokens_to_ids`
+        :param nlu_t: tokenized questions, one token list per sample
+        :param hs_t: tokenized headers per sample; the last header is moved
+            to the front (types and units are rotated the same way)
+        :param col_types, units: per-column type strings / unit tokens
+        :param his_sql, q_know, t_know, schema_link: per-sample inputs
+            forwarded to `generate_inputs`
+
+        OUTPUT
+        all_encoder_layer, pooled_output: the two values from `model_bert`
+        tokens, i_nlu, i_hds, start_index, column_index: per-sample results
+            of `generate_inputs`; l_n: question lengths; l_hs: column counts
+        l_hpu: flattened column span lengths; all_ids: arange over the batch
+        """
+
+        l_n = []  # number of question tokens kept for each sample
+        l_hs = []  # number of columns of each sample
+
+        input_ids = []
+        order_ids = []
+        type_ids = []
+        segment_ids = []
+        match_ids = []
+        input_mask = []
+
+        i_nlu = [
+        ]  # index to retrieve the position of contextual vector later.
+        i_hds = []
+        tokens = []
+        orders = []
+        types = []
+        matchs = []
+        segments = []
+        schema_link_matrix_list = []
+        schema_link_mask_list = []
+        start_index = []
+        column_index = []
+        col_dict_list = []
+        header_list = []
+        header_flatten_token_list = []
+        header_flatten_tokenid_list = []
+        header_flatten_index_list = []
+
+        header_tok_max_len = 0
+        cur_max_length = 0
+        # Pass 1: build the un-padded inputs of every sample.
+        for b, nlu_t1 in enumerate(nlu_t):
+            hs_t1 = [hs_t[b][-1]] + hs_t[b][:-1]
+            type_t1 = [col_types[b][-1]] + col_types[b][:-1]
+            unit_t1 = [units[b][-1]] + units[b][:-1]
+            l_hs.append(len(hs_t1))
+
+            # Build the un-padded tokens and index bookkeeping of this sample
+            # (the sequence layout is defined in generate_inputs).
+            tokens1, orders1, types1, segment1, match1, i_nlu1, i_hds_1, \
+                start_idx, column_start, col_dict, schema_tok, \
+                header_flatten_tokens, header_flatten_index, schema_link_matrix, schema_link_mask = \
+                self.generate_inputs(
+                    nlu_t1, hs_t1, type_t1, unit_t1, his_sql[b],
+                    q_know[b], t_know[b], schema_link[b])
+
+            l_n.append(i_nlu1[1] - i_nlu1[0])
+            start_index.append(start_idx)
+            column_index.append(column_start)
+            col_dict_list.append(col_dict)
+            tokens.append(tokens1)
+            orders.append(orders1)
+            types.append(types1)
+            segments.append(segment1)
+            matchs.append(match1)
+            i_nlu.append(i_nlu1)
+            i_hds.append(i_hds_1)
+            schema_link_matrix_list.append(schema_link_matrix)
+            schema_link_mask_list.append(schema_link_mask)
+            header_flatten_token_list.append(header_flatten_tokens)
+            header_flatten_index_list.append(header_flatten_index)
+            header_list.append(schema_tok)
+            header_max = max([len(schema_tok1) for schema_tok1 in schema_tok])
+            if header_max > header_tok_max_len:
+                header_tok_max_len = header_max
+
+            if len(tokens1) > cur_max_length:
+                cur_max_length = len(tokens1)
+
+            if len(tokens1) > 512:
+                print('input too long!!! total_num:%d\t question:%s' %
+                      (len(tokens1), ''.join(nlu_t1)))
+
+        assert cur_max_length <= 512  # longer inputs are rejected
+        # Pass 2: map tokens to ids and right-pad all lists to cur_max_length.
+        for i, tokens1 in enumerate(tokens):
+            segment_ids1 = segments[i]
+            order_ids1 = orders[i]
+            type_ids1 = types[i]
+            match_ids1 = matchs[i]
+            input_ids1 = tokenizer.convert_tokens_to_ids(tokens1)
+            input_mask1 = [1] * len(input_ids1)
+
+            while len(input_ids1) < cur_max_length:
+                input_ids1.append(0)
+                input_mask1.append(0)
+                segment_ids1.append(0)
+                order_ids1.append(0)
+                type_ids1.append(0)
+                match_ids1.append(0)
+
+            if len(input_ids1) != cur_max_length:  # nlu_t1 is stale here
+                print('Error: ', nlu_t1, tokens1, len(input_ids1),
+                      cur_max_length)
+
+            assert len(input_ids1) == cur_max_length
+            assert len(input_mask1) == cur_max_length
+            assert len(order_ids1) == cur_max_length
+            assert len(segment_ids1) == cur_max_length
+            assert len(match_ids1) == cur_max_length
+            assert len(type_ids1) == cur_max_length
+
+            input_ids.append(input_ids1)
+            order_ids.append(order_ids1)
+            type_ids.append(type_ids1)
+            segment_ids.append(segment_ids1)
+            input_mask.append(input_mask1)
+            match_ids.append(match_ids1)
+        # Pad header ids to header_max_len columns x header_tok_max_len tokens.
+        header_len = []
+        header_ids = []
+        header_max_len = max(
+            [len(header_list1) for header_list1 in header_list])
+        for header1 in header_list:
+            header_len1 = []
+            header_ids1 = []
+            for header_tok in header1:
+                header_len1.append(len(header_tok))
+                header_tok_ids1 = tokenizer.convert_tokens_to_ids(header_tok)
+                while len(header_tok_ids1) < header_tok_max_len:
+                    header_tok_ids1.append(0)
+                header_ids1.append(header_tok_ids1)
+            while len(header_ids1) < header_max_len:
+                header_ids1.append([0] * header_tok_max_len)
+            header_len.append(header_len1)
+            header_ids.append(header_ids1)
+
+        for i, header_flatten_token in enumerate(header_flatten_token_list):
+            header_flatten_tokenid = tokenizer.convert_tokens_to_ids(
+                header_flatten_token)
+            header_flatten_tokenid_list.append(header_flatten_tokenid)
+
+        # Convert to tensor
+        all_input_ids = torch.tensor(
+            input_ids, dtype=torch.long).to(self._device_name)
+        all_order_ids = torch.tensor(
+            order_ids, dtype=torch.long).to(self._device_name)
+        all_type_ids = torch.tensor(
+            type_ids, dtype=torch.long).to(self._device_name)
+        all_input_mask = torch.tensor(
+            input_mask, dtype=torch.long).to(self._device_name)
+        all_segment_ids = torch.tensor(
+            segment_ids, dtype=torch.long).to(self._device_name)
+        all_match_ids = torch.tensor(
+            match_ids, dtype=torch.long).to(self._device_name)
+        all_header_ids = torch.tensor(
+            header_ids, dtype=torch.long).to(self._device_name)
+        all_ids = torch.arange(
+            all_input_ids.shape[0], dtype=torch.long).to(self._device_name)
+        # Header tokens of all columns in one zero-padded row per sample.
+        bS = len(header_flatten_tokenid_list)
+        max_header_flatten_token_length = max(
+            [len(x) for x in header_flatten_tokenid_list])
+        all_header_flatten_tokens = numpy.zeros(
+            (bS, max_header_flatten_token_length), dtype='int32')
+        all_header_flatten_index = numpy.zeros(
+            (bS, max_header_flatten_token_length), dtype='int32')
+        for i, header_flatten_tokenid in enumerate(
+                header_flatten_tokenid_list):
+            for j, tokenid in enumerate(header_flatten_tokenid):
+                all_header_flatten_tokens[i, j] = tokenid
+            for j, hdindex in enumerate(header_flatten_index_list[i]):
+                all_header_flatten_index[i, j] = hdindex
+        all_header_flatten_output = numpy.zeros((bS, header_max_len + 1),
+                                                dtype='int32')
+        all_header_flatten_tokens = torch.tensor(
+            all_header_flatten_tokens, dtype=torch.long).to(self._device_name)
+        all_header_flatten_index = torch.tensor(
+            all_header_flatten_index, dtype=torch.long).to(self._device_name)
+        all_header_flatten_output = torch.tensor(
+            all_header_flatten_output,
+            dtype=torch.float32).to(self._device_name)
+        # Column id (+ 1) and mask for every token position in col_dict.
+        all_token_column_id = numpy.zeros((bS, cur_max_length), dtype='int32')
+        all_token_column_mask = numpy.zeros((bS, cur_max_length),
+                                            dtype='float32')
+        for bi, col_dict in enumerate(col_dict_list):
+            for ki, vi in col_dict.items():
+                all_token_column_id[bi, ki] = vi + 1
+                all_token_column_mask[bi, ki] = 1.0
+        all_token_column_id = torch.tensor(
+            all_token_column_id, dtype=torch.long).to(self._device_name)
+        all_token_column_mask = torch.tensor(
+            all_token_column_mask, dtype=torch.float32).to(self._device_name)
+        # Copy each schema-link matrix / mask into a zero-padded square array.
+        all_schema_link_matrix = numpy.zeros(
+            (bS, cur_max_length, cur_max_length), dtype='int32')
+        all_schema_link_mask = numpy.zeros(
+            (bS, cur_max_length, cur_max_length), dtype='float32')
+        for i, schema_link_matrix in enumerate(schema_link_matrix_list):
+            temp_len = schema_link_matrix.shape[0]
+            all_schema_link_matrix[i, 0:temp_len,
+                                   0:temp_len] = schema_link_matrix
+            all_schema_link_mask[i, 0:temp_len,
+                                 0:temp_len] = schema_link_mask_list[i]
+        all_schema_link_matrix = torch.tensor(
+            all_schema_link_matrix, dtype=torch.long).to(self._device_name)
+        all_schema_link_mask = torch.tensor(
+            all_schema_link_mask, dtype=torch.long).to(self._device_name)
+
+        # Span length of every column, flattened over the batch.
+        l_hpu = self.gen_l_hpu(i_hds)
+
+        # Run the backbone on the whole batch.
+        all_encoder_layer, pooled_output = model_bert(
+            all_input_ids,
+            all_header_ids,
+            token_order_ids=all_order_ids,
+            token_type_ids=all_segment_ids,
+            attention_mask=all_input_mask,
+            match_type_ids=all_match_ids,
+            l_hs=l_hs,
+            header_len=header_len,
+            type_ids=all_type_ids,
+            col_dict_list=col_dict_list,
+            ids=all_ids,
+            header_flatten_tokens=all_header_flatten_tokens,
+            header_flatten_index=all_header_flatten_index,
+            header_flatten_output=all_header_flatten_output,
+            token_column_id=all_token_column_id,
+            token_column_mask=all_token_column_mask,
+            column_start_index=column_index,
+            headers_length=l_hs,
+            all_schema_link_matrix=all_schema_link_matrix,
+            all_schema_link_mask=all_schema_link_mask,
+            output_all_encoded_layers=False)
+
+        return all_encoder_layer, pooled_output, tokens, i_nlu, i_hds, \
+            l_n, l_hpu, l_hs, start_index, column_index, all_ids
+
+    def predict(self, querys):
+        """Run the backbone and the SQL head on a batch and decode the output.
+
+        Returns one dict per sample of `querys` with the keys `question_tok`,
+        `question`, `table_id`, `sql` (holding `cond_conn_op`, `sel`, `agg`
+        and `conds`), `action` and `model_out` (the raw argmax predictions).
+        """
+        self.head_model.eval()
+        self.backbone_model.eval()
+
+        fields = self.get_fields_info(querys, None, train=False)
+        (nlu, nlu_t, _, q_know, t_know, _, hs_t, types, units, his_sql,
+         schema_link) = fields
+
+        with torch.no_grad():
+            bert_out = self.get_bert_output(
+                self.backbone_model, self.tokenizer, nlu_t, hs_t, types, units,
+                his_sql, q_know, t_know, schema_link)
+            (encoded, _, tokens, _, _, l_n, _, l_hs, start_index, column_index,
+             ids) = bert_out
+            head_out = self.head_model(encoded, l_n, l_hs, start_index,
+                                       column_index, tokens, ids)
+        (s_action, s_sc, s_sa, s_cco, s_wc, s_wo, (s_wvs_s, s_wvs_e),
+         (s_slen, s_wlen)) = head_out
+
+        def best(scores):
+            # most likely class along the last axis, as nested python lists
+            return torch.argmax(F.softmax(scores, -1), -1).cpu().tolist()
+
+        action_batch = best(s_action)
+        scco_batch = best(s_cco)
+        sc_batch = best(s_sc)
+        sa_batch = best(s_sa)
+        wc_batch = best(s_wc)
+        wo_batch = best(s_wo)
+        wvss_batch = best(s_wvs_s)
+        wvse_batch = best(s_wvs_e)
+        slen_batch = best(s_slen)
+        wlen_batch = best(s_wlen)
+
+        # where-value spans: predicted start / end shifted by -1, floored at 0
+        pr_wvi = [[[max(0, wvss_batch[i][j] - 1),
+                    max(0, wvse_batch[i][j] - 1)]
+                   for j in range(wlen_batch[i])]
+                  for i in range(len(querys))]
+        pr_wvi_str = self.convert_string(pr_wvi, nlu, nlu_t)
+
+        pre_results = []
+        for ib, query in enumerate(querys):
+            last_col = l_hs[ib] - 1  # fallback column id (the last index)
+            # select clause: the first `sl` predicted (column, agg) pairs
+            sl = slen_batch[ib]
+            sel_ids = list(
+                numpy.array(sc_batch[ib][:sl]).astype(numpy.int32) - 1)
+            agg_ids = list(
+                numpy.array(sa_batch[ib][:sl]).astype(numpy.int32))
+            sels, aggs = [], []
+            for ia, sel in enumerate(sel_ids):
+                agg = agg_ids[ia]
+                if sel == -1:
+                    if agg > 0:
+                        sels.append(last_col)
+                        aggs.append(agg)
+                    continue
+                sels.append(sel)
+                aggs.append(0 if agg == -1 else agg)
+            if not sels:
+                sels.append(last_col)
+                aggs.append(0)
+            assert len(sels) == len(aggs)
+
+            # where clause: the first `wl` conditions that name a column
+            wl = wlen_batch[ib]
+            wc_os = list(
+                numpy.array(wc_batch[ib][:wl]).astype(numpy.int32) - 1)
+            wo_os = list(numpy.array(wo_batch[ib][:wl]).astype(numpy.int32))
+            conds = [[wc_os[i], wo_os[i], pr_wvi_str[ib][i]]
+                     for i in range(wl) if wc_os[i] != -1]
+            if not conds:
+                conds.append([last_col, 2, 'Nulll'])
+            sql = {'cond_conn_op': scco_batch[ib], 'sel': sels, 'agg': aggs,
+                   'conds': conds}
+            pre_results.append({
+                'question_tok': query['question_tok'],
+                'question': query['question'],
+                'table_id': query['table_id'],
+                'sql': sql,
+                'action': action_batch[ib],
+                'model_out': [
+                    sc_batch[ib], sa_batch[ib], wc_batch[ib], wo_batch[ib],
+                    wvss_batch[ib], wvse_batch[ib]
+                ]
+            })
+
+        return pre_results
+
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """Predict the SQL of the first sample in `input['datas']`.
+
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data; only the first
+                entry of `input['datas']` contributes to the returned value.
+
+        Returns:
+            Dict[str, Tensor]: `result` is the first prediction of `predict`,
+                `history_sql` the sample's history SQL (None when it has none).
+        """
+        datas = input['datas']
+        return {
+            'result': self.predict(datas)[0],
+            'history_sql': datas[0].get('history_sql')
+        }
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 8ddeb314..d7d619bf 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -35,6 +35,7 @@ class OutputKeys(object):
     UUID = 'uuid'
     WORD = 'word'
     KWS_LIST = 'kws_list'
+    HISTORY = 'history'
     TIMESTAMPS = 'timestamps'
     SPLIT_VIDEO_NUM = 'split_video_num'
     SPLIT_META_DICT = 'split_meta_dict'
@@ -471,6 +472,13 @@ TASK_OUTPUTS = {
     # }
     Tasks.conversational_text_to_sql: [OutputKeys.TEXT],
 
+    # table-question-answering result for single sample
+    # {
+    #   "output": "SELECT shop.Name FROM shop.",
+    #   "history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]}
+    # }
+    Tasks.table_question_answering: [OutputKeys.OUTPUT, OutputKeys.HISTORY],
+
     # ============ audio tasks ===================
     # asr result for single sample
     # { "text": "每一天都要快乐喔"}
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 50313cf7..5e244b27 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -66,6 +66,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.conversational_text_to_sql:
     (Pipelines.conversational_text_to_sql,
      'damo/nlp_star_conversational-text-to-sql'),
+    Tasks.table_question_answering:
+    (Pipelines.table_question_answering_pipeline,
+     'damo/nlp-convai-text2sql-pretrain-cn'),
     Tasks.text_error_correction:
     (Pipelines.text_error_correction,
      'damo/nlp_bart_text-error-correction_chinese'),
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 6f898c0f..b5c53f82 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -5,6 +5,7 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .conversational_text_to_sql_pipeline import ConversationalTextToSqlPipeline
+    from .table_question_answering_pipeline import TableQuestionAnsweringPipeline
     from .dialog_intent_prediction_pipeline import DialogIntentPredictionPipeline
     from .dialog_modeling_pipeline import DialogModelingPipeline
     from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline
@@ -31,6 +32,8 @@ else:
     _import_structure = {
         'conversational_text_to_sql_pipeline':
         ['ConversationalTextToSqlPipeline'],
+        'table_question_answering_pipeline':
+        ['TableQuestionAnsweringPipeline'],
         'dialog_intent_prediction_pipeline':
         ['DialogIntentPredictionPipeline'],
         'dialog_modeling_pipeline': ['DialogModelingPipeline'],
diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py
new file mode 100644
index 00000000..8235a4d6
--- /dev/null
+++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py
@@ -0,0 +1,284 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, Union
+
+import torch
+from transformers import BertTokenizer
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.nlp import TableQuestionAnswering
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import TableQuestionAnsweringPreprocessor
+from modelscope.preprocessors.star3.fields.database import Database
+from modelscope.preprocessors.star3.fields.struct import Constant, SQLQuery
+from modelscope.utils.constant import ModelFile, Tasks
+
+__all__ = ['TableQuestionAnsweringPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.table_question_answering,
+    module_name=Pipelines.table_question_answering_pipeline)
+class TableQuestionAnsweringPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[TableQuestionAnswering, str],
+                 preprocessor: TableQuestionAnsweringPreprocessor = None,
+                 db: Database = None,
+                 **kwargs):
+        """Create a table question answering pipeline.
+
+        Args:
+            model (TableQuestionAnswering | str): a model instance, or any
+                value that `Model.from_pretrained` accepts.
+            preprocessor (TableQuestionAnsweringPreprocessor): optional; by
+                default one is created from `model.model_dir`.
+            db (Database): optional table database; by default it is loaded
+                from `table.json` and `synonym.txt` in `model.model_dir`.
+            **kwargs: forwarded to the base `Pipeline` constructor.
+        """
+        if not isinstance(model, TableQuestionAnswering):
+            model = Model.from_pretrained(model)
+        model_dir = model.model_dir
+        if preprocessor is None:
+            preprocessor = TableQuestionAnsweringPreprocessor(model_dir)
+
+        # tokenizer built from the vocabulary file found in the model dir
+        vocab_file = os.path.join(model_dir, ModelFile.VOCAB_FILE)
+        self.tokenizer = BertTokenizer(vocab_file)
+
+        # without an explicit database, load the one stored with the model
+        if db is None:
+            db = Database(
+                tokenizer=self.tokenizer,
+                table_file_path=os.path.join(model_dir, 'table.json'),
+                syn_dict_file_path=os.path.join(model_dir, 'synonym.txt'))
+        self.db = db
+
+        # expose the lookup tables of `Constant` as pipeline attributes
+        constant = Constant()
+        for name in ('agg_ops', 'cond_ops', 'cond_conn_ops', 'action_ops',
+                     'max_select_num', 'max_where_num', 'col_type_dict',
+                     'schema_link_dict'):
+            setattr(self, name, getattr(constant, name))
+
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def post_process_multi_turn(self, history_sql, result, table):
+        """Combine this turn's predicted SQL with the previous turn's SQL.
+
+        The action name looked up in `self.action_ops` selects the rule:
+
+        * 'out_of_scripts': return `history_sql` unchanged.
+        * 'del_focus': drop the history columns that the current SQL
+          selects.
+        * 'change_agg_only': take the current aggregator for columns
+          selected in both turns.
+        * 'change_focus_total': take the current selection and add the
+          current conditions on columns not constrained yet.
+        * 'del_cond': drop history conditions on currently selected
+          columns.
+        * 'change_cond': replace history conditions by the current
+          conditions on the same column.
+        * 'add_cond': append the current conditions to the history ones.
+        * any other action ('switch_table', 'restart', 'firstTurn', ...):
+          return the current SQL alone.
+
+        Merged results only keep conditions whose column index is below the
+        number of headers; when none is left, one placeholder condition on
+        that extra index is used instead.
+
+        Example with three headers and action 'add_cond':
+            history_sql: {'sel': [0], 'agg': [0], 'conds': [[1, 2, 'x']]}
+            current SQL: {'sel': [0], 'agg': [0], 'conds': [[2, 0, '5']]}
+            returns:     {'sel': [0], 'agg': [0],
+                          'conds': [[1, 2, 'x'], [2, 0, '5']]}
+
+        Args:
+            history_sql (dict): SQL struct of the previous turn, or None
+                when there is no history (the current SQL is returned).
+            result (dict): reads 'action' (index into `self.action_ops`)
+                and 'sql' (SQL struct with 'sel', 'agg' and 'conds').
+            table (dict): reads 'header_name'.
+
+        Returns:
+            dict: the SQL struct for this turn. Merges are built on a deep
+            copy, so `history_sql` itself is never modified.
+        """
+        # `copy` is not imported at module level; without this local import
+        # the 'del_focus' and 'del_cond' rules raised NameError.
+        import copy
+
+        action = self.action_ops[result['action']]
+        num_headers = len(table['header_name'])
+        current_sql = result['sql']
+
+        # without history there is nothing to merge
+        if history_sql is None:
+            return current_sql
+        if action == 'out_of_scripts':
+            return history_sql
+
+        merge_actions = (
+            'del_focus',
+            'change_agg_only',
+            'change_focus_total',
+            'del_cond',
+            'change_cond',
+            'add_cond',
+        )
+        if action not in merge_actions:
+            # 'switch_table', 'restart', 'firstTurn' and unknown actions
+            # discard the history and use the current SQL alone
+            return current_sql
+
+        def real_conds(conds, placeholder):
+            # keep the conditions on real columns (index below the number
+            # of headers); never return an empty list
+            kept = [cond for cond in conds if cond[0] < num_headers]
+            return kept if kept else [placeholder]
+
+        null_cond = [num_headers, 2, 'Null']
+        # work on a private copy so that the caller's history stays intact
+        # (some rules used to edit `history_sql` in place)
+        merged = copy.deepcopy(history_sql)
+
+        if action == 'del_focus':
+            # keep the history columns that are not selected now
+            sels, aggs = [], []
+            for idx, col in enumerate(merged['sel']):
+                if col not in current_sql['sel']:
+                    sels.append(col)
+                    aggs.append(merged['agg'][idx])
+            if not sels:
+                # nothing left to select: fall back to the extra index
+                sels.append(num_headers)
+                aggs.append(0)
+            merged['sel'], merged['agg'] = sels, aggs
+            merged['conds'] = real_conds(merged['conds'], null_cond)
+
+        elif action == 'change_agg_only':
+            # same columns as before; shared ones get the new aggregator
+            sels, aggs = [], []
+            for idx, col in enumerate(merged['sel']):
+                sels.append(col)
+                if col in current_sql['sel']:
+                    # the last current aggregator for this column wins
+                    new_agg = -1
+                    for pos, agg in enumerate(current_sql['agg']):
+                        if current_sql['sel'][pos] == col:
+                            new_agg = agg
+                    aggs.append(new_agg)
+                else:
+                    aggs.append(merged['agg'][idx])
+            merged['sel'], merged['agg'] = sels, aggs
+
+        elif action == 'change_focus_total':
+            merged['sel'] = current_sql['sel']
+            merged['agg'] = current_sql['agg']
+            for cond in current_sql['conds']:
+                # compared against conditions appended earlier in this
+                # loop as well, so each column is added at most once
+                if cond[0] < num_headers and all(
+                        cond[0] != old[0] for old in merged['conds']):
+                    merged['conds'].append(cond)
+
+        elif action == 'del_cond':
+            # conditions on currently selected columns are removed
+            remaining = [
+                cond for cond in merged['conds']
+                if cond[0] not in current_sql['sel']
+            ]
+            merged['conds'] = real_conds(remaining, null_cond)
+
+        elif action == 'change_cond':
+            # each history condition gives way to the current conditions
+            # on the same column, if there are any
+            replaced = []
+            for old in merged['conds']:
+                updates = [
+                    new for new in current_sql['conds'] if new[0] == old[0]
+                ]
+                replaced.extend(updates if updates else [old])
+            # NOTE(review): only this rule uses a 4-field placeholder; kept
+            # as found because not all consumers of 'conds' are visible here
+            merged['conds'] = real_conds(replaced,
+                                         [num_headers, 2, 'Null', 'Null'])
+
+        else:  # 'add_cond'
+            # history conditions followed by the current ones
+            merged['conds'] = real_conds(
+                merged['conds'] + list(current_sql['conds']), null_cond)
+
+        return merged
+
+    def sql_dict_to_str(self, result, table):
+        """Render the SQL struct in `result` as text and as a query.
+
+        Args:
+            result (dict): reads result['sql'] with the keys 'sel', 'agg',
+                'conds' and 'cond_conn_op'.
+            table (dict): reads 'header_name', 'header_id', 'table_id' and
+                'table_name'.
+
+        Returns:
+            SQLQuery: `string` is written with header and table names,
+            `query` with back-quoted ids; `sql_result` is result['sql'].
+        """
+        # one extra entry each, addressed by the index len(header_name)
+        names = table['header_name'] + ['空列']
+        ids = table['header_id'] + ['null']
+        sql = result['sql']
+        def qualified(col):
+            return '`%s`.`%s`' % (table['table_id'], ids[col])
+
+        sel_text, sel_query = [], []
+        for pos, col in enumerate(sql['sel']):
+            text, query = names[col], qualified(col)
+            agg = sql['agg'][pos]
+            if agg != 0:
+                text = self.agg_ops[agg] + '( ' + text + ' )'
+                query = self.agg_ops[agg] + '( ' + query + ' )'
+            sel_text.append(text)
+            sel_query.append(query)
+
+        cond_text, cond_query = [], []
+        for cond in sql['conds']:
+            text, query = names[cond[0]], qualified(cond[0])
+            rest = ' ' + self.cond_ops[cond[1]] + ' "' + cond[2] + '" )'
+            cond_text.append('( ' + text + rest)
+            cond_query.append('( ' + query + rest)
+
+        joiner = ' ' + self.cond_conn_ops[sql['cond_conn_op']] + ' '
+        final_str = 'SELECT %s FROM %s WHERE %s' % (
+            ', '.join(sel_text), table['table_name'], joiner.join(cond_text))
+        final_sql = 'SELECT %s FROM `%s` WHERE %s' % (
+            ', '.join(sel_query), table['table_id'], joiner.join(cond_query))
+        return SQLQuery(
+            string=final_str, query=final_sql, sql_result=result['sql'])
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+        """Turn the model prediction into the pipeline output.
+
+        Args:
+            inputs (Dict[str, Any]): reads 'result' (the prediction, with
+                'table_id', 'action' and 'sql') and 'history_sql'.
+
+        Returns:
+            Dict[str, str]: `OutputKeys.OUTPUT` holds the rendered query
+            object, `OutputKeys.HISTORY` the merged SQL struct. Note that
+            result['sql'] is overwritten with the merged struct.
+        """
+        result, history_sql = inputs['result'], inputs['history_sql']
+        table = self.db.tables[result['table_id']]
+        # merge with the previous turn first, then render the merged SQL
+        result['sql'] = self.post_process_multi_turn(
+            history_sql=history_sql, result=result, table=table)
+        rendered = self.sql_dict_to_str(result=result, table=table)
+        return {OutputKeys.OUTPUT: rendered, OutputKeys.HISTORY: result['sql']}
+
+    def _collate_fn(self, data):
+        return data  # identity: the input is handed back unchanged
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 212339ae..04901dc5 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -30,6 +30,7 @@ if TYPE_CHECKING:
                         DialogStateTrackingPreprocessor)
     from .video import ReadVideoData, MovieSceneSegmentationPreprocessor
     from .star import ConversationalTextToSqlPreprocessor
+    from .star3 import TableQuestionAnsweringPreprocessor
 
 else:
     _import_structure = {
@@ -62,6 +63,7 @@ else:
             'DialogStateTrackingPreprocessor', 'InputFeatures'
         ],
         'star': ['ConversationalTextToSqlPreprocessor'],
+        'star3': ['TableQuestionAnsweringPreprocessor'],
     }
 
     import sys
diff --git a/modelscope/preprocessors/star3/__init__.py b/modelscope/preprocessors/star3/__init__.py
new file mode 100644
index 00000000..9aa562d7
--- /dev/null
+++ b/modelscope/preprocessors/star3/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    # type checkers only see names that this package really provides
+    from .table_question_answering_preprocessor import TableQuestionAnsweringPreprocessor
+
+else:
+    _import_structure = {  # submodule name -> names exported from it
+        'table_question_answering_preprocessor':
+        ['TableQuestionAnsweringPreprocessor'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/preprocessors/star3/fields/__init__.py b/modelscope/preprocessors/star3/fields/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/preprocessors/star3/fields/database.py b/modelscope/preprocessors/star3/fields/database.py
new file mode 100644
index 00000000..a99800cf
--- /dev/null
+++ b/modelscope/preprocessors/star3/fields/database.py
@@ -0,0 +1,77 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import tqdm
+
+from modelscope.preprocessors.star3.fields.struct import Trie
+
+
+class Database:
+    """Tables and synonym dictionary loaded from two files.
+
+    `tables` maps a table_id to its table dict, `syn_dict` a lower-cased
+    key to its list of lower-cased synonyms.
+    """
+
+    def __init__(self, tokenizer, table_file_path, syn_dict_file_path):
+        self.tokenizer = tokenizer
+        self.tables = self.init_tables(table_file_path=table_file_path)
+        self.syn_dict = self.init_syn_dict(
+            syn_dict_file_path=syn_dict_file_path)
+
+    def init_tables(self, table_file_path):
+        """Read one JSON table per line and add the derived fields."""
+        # utf-8 is requested explicitly, as init_syn_dict already does, so
+        # loading does not depend on the platform's default encoding;
+        # blank lines are skipped instead of failing in json.loads
+        with open(table_file_path, 'r', encoding='utf-8') as fo:
+            lines = [line for line in fo if line.strip()]
+
+        tables = {}
+        for line in tqdm.tqdm(lines, desc='Load Tables'):
+            table = json.loads(line.strip())
+
+            # header tokens, followed by those of the extra empty column
+            headers_tokens = [
+                self.tokenizer.tokenize(header)
+                for header in table['header_name']
+            ]
+            headers_tokens.append(self.tokenizer.tokenize('空列'))
+            table['tablelen'] = sum(len(toks) for toks in headers_tokens)
+            table['header_tok'] = headers_tokens
+
+            table['header_types'].append('null')
+            table['header_units'] = [
+                self.tokenizer.tokenize(unit) for unit in table['header_units']
+            ] + [[]]
+
+            # one trie of cell values per column; columns typed 'real',
+            # 'number' or 'duration' contribute no values
+            trie_set = [Trie() for _ in table['header_name']]
+            for row in table['rows']:
+                for ii, cell in enumerate(row):
+                    col_type = table['header_types'][ii].lower()
+                    if any(kind in col_type
+                           for kind in ('real', 'number', 'duration')):
+                        continue
+                    word = str(cell).strip().lower()
+                    trie_set[ii].insert(word, word)
+
+            table['value_trie'] = trie_set
+            tables[table['table_id']] = table
+
+        return tables
+
+    def init_syn_dict(self, syn_dict_file_path):
+        """Read `keys<TAB>values` lines ('|'-separated) into a dict."""
+        with open(syn_dict_file_path, encoding='utf-8') as fo:
+            lines = fo.readlines()
+
+        syn_dict = {}
+        for line in tqdm.tqdm(lines, desc='Load Synonym Dict'):
+            tokens = line.strip().split('\t')
+            if len(tokens) != 2:
+                continue
+            values = [v.lower().strip() for v in tokens[1].strip().split('|')]
+            for key in tokens[0].strip().split('|'):
+                syn_dict.setdefault(key.lower().strip(), []).extend(values)
+        return syn_dict
diff --git a/modelscope/preprocessors/star3/fields/schema_link.py b/modelscope/preprocessors/star3/fields/schema_link.py
new file mode 100644
index 00000000..40613f78
--- /dev/null
+++ b/modelscope/preprocessors/star3/fields/schema_link.py
@@ -0,0 +1,423 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import re
+
+from modelscope.preprocessors.star3.fields.struct import TypeInfo
+
+
+class SchemaLinker:
+
+    def __init__(self):
+        """Nothing to set up: the linker keeps no state of its own."""
+
+    def find_in_list(self, comlist, words):
+        """Tell whether `words` occurs inside any element of `comlist`.
+
+        Uses the `in` operator per element, so for strings this is a
+        substring test.
+        """
+        return any(words in com for com in comlist)
+
+    def get_continue_score(self, pstr, tstr):
+        """Score how much of `tstr` occurs in `pstr` as contiguous pieces.
+
+        Collects substrings of `tstr` (length >= 2, longest first) that
+        occur in `pstr` and are not inside an already collected piece.
+        The score is the sum of squared piece lengths divided by
+        len(tstr) ** 2 (plus 0.001), capped at 1.0.
+        """
+        pieces = []
+        target_len = len(tstr)
+        for size in range(min(len(pstr), target_len), 1, -1):
+            for start in range(target_len - size + 1):
+                piece = tstr[start:start + size]
+                if piece in pstr and not self.find_in_list(pieces, piece):
+                    pieces.append(piece)
+
+        squared = sum(len(piece) * len(piece) for piece in pieces)
+        weight = squared / (target_len * target_len + 0.001)
+        return min(weight, 1.0)
+
+    def get_match_score(self, ptokens, ttokens):
+        """Blend token-set overlap (0.4) with the continuity score (0.6)."""
+        phrase_set, target_set = set(ptokens), set(ttokens)
+        overlap = len(phrase_set & target_set) / (
+            len(phrase_set | target_set) + 0.001)
+        continuity = self.get_continue_score(''.join(ptokens),
+                                             ''.join(ttokens))
+        return 0.4 * overlap + 0.6 * continuity
+
+    def is_number(self, s):
+        """Tell whether `s` reads as a number.
+
+        True when `float(s)` succeeds or, after a ValueError there, when
+        `unicodedata.numeric(s)` accepts it.
+        """
+        import unicodedata
+        try:
+            float(s)
+        except ValueError:
+            try:
+                unicodedata.numeric(s)
+            except (TypeError, ValueError):
+                return False
+        return True
+
+    def get_match_phrase(self, query, target):
+        """Find the part of `query` that best matches `target`.
+
+        Returns:
+            tuple: (phrase, score). A literal occurrence of `target` gives
+            (target, 1.0); otherwise character windows of `query` are scored
+            with `get_match_score` and the best one is returned.
+        """
+        if target in query:
+            return target, 1.0
+
+        query_chars = list(query)
+        target_chars = list(target)
+        target_charset = set(target_chars)
+        max_window = 2 * len(target_chars)
+
+        phrase, score = '', 0.0
+        for start, char in enumerate(query_chars):
+            # a window must begin with a character that occurs in the target
+            if char not in target_charset:
+                continue
+            longest_end = min(len(query_chars), start + max_window)
+            for end in range(longest_end, start, -1):
+                window = query_chars[start:end]
+                weight = self.get_match_score(window, target_chars)
+                if weight + 0.001 > score:
+                    score, phrase = weight, ''.join(window)
+
+        # a numeric target only counts when it was found exactly
+        if self.is_number(target) and phrase != target:
+            score = 0.0
+        if len(phrase) > 1 and phrase in target:
+            score *= (1.0 + 0.05 * len(phrase))
+        return phrase, score
+
+    def allfindpairidx(self, que_tok, value_tok, weight):
+        """Find all case-insensitive occurrences of `value_tok` in `que_tok`.
+
+        Returns:
+            list: one [start, end, weight] entry per occurrence, with `end`
+            the index of the last matched token (`start` itself when
+            `value_tok` is empty).
+        """
+        size = len(value_tok)
+        idxs = []
+        for start in range(len(que_tok) - size + 1):
+            window = que_tok[start:start + size]
+            # compare token by token, stopping at the first difference
+            if all(v.lower() == q.lower() for v, q in zip(value_tok, window)):
+                idxs.append([start, max(start, start + size - 1), weight])
+        return idxs
+
+    def findnear(self, ps1, pe1, ps2, pe2):
+        """Tell whether two spans have facing boundaries at most 2 apart."""
+        near = abs(ps1 - pe2) <= 2 or abs(pe1 - ps2) <= 2
+        return near
+
+    def get_column_type(self, col_idx, table):
+        """Map the declared type of a column to real, date, bool or text.
+
+        Case-sensitive substring checks; the first matching group wins.
+        """
+        declared = table['header_types'][col_idx]
+        groups = (('real', ('number', 'duration', 'real')),
+                  ('date', ('date', )), ('bool', ('bool', )))
+        for normalized, markers in groups:
+            if any(marker in declared for marker in markers):
+                return normalized
+        return 'text'
+
+    def add_type_all(self, typeinfos, index, idxs, label, linktype, value,
+                     orgvalue):
+        """Add one TypeInfo per [start, end, weight] span, placed by pstart.
+
+        `typeinfos` is modified in place and also returned.
+        """
+        for span in idxs:
+            info = TypeInfo(label, index, linktype, value, orgvalue, span[0],
+                            span[1], span[2])
+            for pos, existing in enumerate(typeinfos):
+                if info.pstart < existing.pstart:
+                    typeinfos.insert(pos, info)
+                    break
+            else:
+                typeinfos.append(info)
+        return typeinfos
+
+    def save_info(self, tinfo, sinfo):
+        """Decide whether `tinfo` may be kept next to `sinfo`.
+
+        Returns True to keep `tinfo`, False to drop it.
+        """
+        # spans that do not overlap never compete
+        if tinfo.pstart > sinfo.pend or tinfo.pend < sinfo.pstart:
+            return True
+        # an unlinked info (index -1) lying inside `sinfo` is dropped
+        if tinfo.pstart >= sinfo.pstart and tinfo.pend <= sinfo.pend \
+                and tinfo.index == -1:
+            return False
+        # same span and practically the same weight: keep both
+        if tinfo.pstart == sinfo.pstart and sinfo.pend == tinfo.pend \
+                and abs(tinfo.weight - sinfo.weight) < 0.01:
+            return True
+
+        # overlapping spans: compare span lengths and weights by label kind
+        schema_labels = ('col', 'val')
+        s_longer = (sinfo.pend - sinfo.pstart) > (tinfo.pend - tinfo.pstart)
+        if sinfo.label in schema_labels:
+            if tinfo.label not in schema_labels:
+                return False
+            s_stronger = sinfo.weight > tinfo.weight and sinfo.index != -1
+            return not (s_longer or s_stronger)
+        if tinfo.label in ('op', 'agg'):
+            return not (s_longer or sinfo.weight > tinfo.weight)
+        return True
+
+    def normal_type_infos(self, infos):
+        """Keep the infos that survive pairwise `save_info` checks."""
+        kept = []
+        for info in infos:
+            # the newcomer first evicts kept infos that lose against it
+            kept = list(filter(lambda old: self.save_info(old, info), kept))
+            # then it is placed by pstart unless a kept info rejects it
+            for pos, old in enumerate(kept):
+                if not self.save_info(info, old):
+                    break
+                if info.pstart < old.pstart:
+                    kept.insert(pos, info)
+                    break
+            else:
+                kept.append(info)
+        return kept
+
+    def findnear_typeinfo(self, info1, info2):  # `findnear` on two infos
+        a, b = info1, info2
+        return self.findnear(a.pstart, a.pend, b.pstart, b.pend)
+
+    def find_real_column(self, infos, table):
+        """Give unlinked 'val' infos the index of a nearby 'real' column.
+
+        A column qualifies when it is near the value, or near the first
+        'op' info that is near the value. `infos` is changed in place.
+        """
+        for vinfo in infos:
+            if vinfo.label != 'val' or vinfo.index != -1:
+                continue
+            near_op = next((oinfo for oinfo in infos if oinfo.label == 'op'
+                            and self.findnear_typeinfo(vinfo, oinfo)), None)
+            for cinfo in infos:
+                if cinfo.label != 'col':
+                    continue
+                if table['header_types'][cinfo.index] != 'real':
+                    continue
+                if self.findnear_typeinfo(cinfo, vinfo) or (
+                        near_op is not None
+                        and self.findnear_typeinfo(cinfo, near_op)):
+                    vinfo.index = cinfo.index
+                    break
+        return infos
+
+    def filter_column_infos(self, infos):
+        """Drop 'col' infos whose span is shared with a later 'col' info.
+
+        Both members of such a pair are removed (each info is paired with
+        the first later column on the same span). Returns a new list.
+        """
+        # positions that are left out of the result
+        dropped = set()
+        for i, info in enumerate(infos):
+            if info.label != 'col':
+                continue
+            for j in range(i + 1, len(infos)):
+                other = infos[j]
+                if other.label == 'col' and other.pstart == info.pstart \
+                        and other.pend == info.pend:
+                    dropped.update((i, j))
+                    break
+        return [
+            info for idx, info in enumerate(infos) if idx not in dropped
+        ]
+
+    def filter_type_infos(self, infos, table):
+        """Filter columns, link real values and drop covered loose values.
+
+        After `filter_column_infos` and `find_real_column`, a 'val' info
+        whose index belongs to no remaining 'col' info is removed when its
+        span lies inside a 'val' info that does belong to such a column.
+        """
+        infos = self.filter_column_infos(infos)
+        infos = self.find_real_column(infos, table)
+
+        # values grouped under the columns that have a 'col' info
+        values_by_column = {
+            info.index: []
+            for info in infos if info.label == 'col'
+        }
+        for info in infos:
+            if info.label == 'val' and info.index in values_by_column:
+                values_by_column[info.index].append(info)
+        anchored = [
+            val for vals in values_by_column.values() for val in vals
+        ]
+
+        def covered(info):
+            return any(val.pstart <= info.pstart and val.pend >= info.pend
+                       for val in anchored)
+
+        return [
+            info for info in infos
+            if info.label != 'val' or info.index in values_by_column
+            or not covered(info)
+        ]
+
+    def get_table_match_score(self, nlu_t, schema_link):
+        """Score a table by its weighted links over the question length.
+
+        Entries with length > 0 and column_index != -1 weigh 1.0, else 0.5.
+        """
+        match_len = 0
+        for info in schema_link:
+            linked = info['question_len'] > 0 and info['column_index'] != -1
+            scale = 1.0 if linked else 0.5
+            match_len += scale * info['question_len'] * info['weight']
+        return match_len / (len(nlu_t) + 0.1)
+
+    def get_entity_linking(self, tokenizer, nlu, nlu_t, tables, col_syn_dict):
+        """
+        get linking between question and schema column
+
+        Args:
+            tokenizer: object with a `tokenize(str)` method.
+            nlu (str): the question text.
+            nlu_t (list): the question tokens.
+            tables (dict): table name -> table dict; reads 'table_id',
+                'header_name', 'header_types' and, if present, 'value_trie'.
+            col_syn_dict (dict): column name -> list of synonyms.
+
+        Returns:
+            list: at most 4 dicts, highest 'match_score' first, each with
+            'table_id', 'question_knowledge', 'header_knowledge',
+            'schema_link' and 'match_score'.
+        """
+        question = nlu.lower()
+        numbers = re.findall(r'[-]?\d*\.\d+|[-]?\d+|\d+', nlu)
+
+        # search schema link in every table
+        search_result_list = []
+        for tablename in tables:
+            table = tables[tablename]
+            # tables without 'value_trie' have no cell values to link
+            trie_set = table.get('value_trie', [])
+
+            typeinfos = []
+            for ii, column in enumerate(table['header_name']):
+                column = column.lower()
+                # strip bracketed remarks, ASCII and full-width; the former
+                # ASCII pattern '(.*?)' was a group matching the empty string
+                # and removed nothing
+                column_new = re.sub(r'\(.*?\)', '', column)
+                column_new = re.sub('（.*?）', '', column_new)
+                cphrase, cscore = self.get_match_phrase(question, column_new)
+                if cscore > 0.3 and cphrase.strip() != '':
+                    cidxs = self.allfindpairidx(
+                        nlu_t, tokenizer.tokenize(cphrase), cscore)
+                    typeinfos = self.add_type_all(typeinfos, ii, cidxs, 'col',
+                                                  'column', cphrase, column)
+                if cscore < 0.8 and column_new in col_syn_dict:
+                    for syn_col in set(col_syn_dict[column_new]):
+                        if syn_col == '' or syn_col not in question:
+                            continue
+                        cidxs = self.allfindpairidx(
+                            nlu_t, tokenizer.tokenize(syn_col), 1.0)
+                        typeinfos = self.add_type_all(typeinfos, ii, cidxs,
+                                                      'col', 'column', syn_col,
+                                                      column)
+
+            for ii, trie in enumerate(trie_set):
+                ans = trie.match(question)
+                for vphrase, orgvalue in ans.items():
+                    phrase_tok = tokenizer.tokenize(vphrase)
+                    if len(phrase_tok) == 0 or len(vphrase) < 2:
+                        continue
+                    vidxs = self.allfindpairidx(nlu_t, phrase_tok, 1.0)
+                    linktype = self.get_column_type(ii, table)
+                    typeinfos = self.add_type_all(typeinfos, ii, vidxs, 'val',
+                                                  linktype, vphrase, orgvalue)
+
+            # numbers found in the question are linked as unattached values
+            for number in set(numbers):
+                number_tok = tokenizer.tokenize(number.lower())
+                if len(number_tok) == 0:
+                    continue
+                nidxs = self.allfindpairidx(nlu_t, number_tok, 1.0)
+                typeinfos = self.add_type_all(typeinfos, -1, nidxs, 'val',
+                                              'real', number, number)
+
+            newtypeinfos = self.normal_type_infos(typeinfos)
+            newtypeinfos = self.filter_type_infos(newtypeinfos, table)
+
+            final_question = [0] * len(nlu_t)
+            final_header = [0] * len(table['header_name'])
+            for typeinfo in newtypeinfos:
+                span = range(typeinfo.pstart, typeinfo.pend + 1)
+                if typeinfo.label in ('op', 'agg'):
+                    # the digit ending linktype, offset by 6 (op) or 11 (agg)
+                    score = int(typeinfo.linktype[-1])
+                    score += 6 if typeinfo.label == 'op' else 11
+                    for i in span:
+                        final_question[i] = score
+                elif typeinfo.label == 'col':
+                    for i in span:
+                        final_question[i] = 4
+                    # an odd header code records a column mention
+                    if final_header[typeinfo.index] % 2 == 0:
+                        final_header[typeinfo.index] += 1
+                elif typeinfo.label == 'val':
+                    if typeinfo.index == -1:
+                        for i in span:
+                            final_question[i] = 5
+                    else:
+                        # value span tagged 1 (first), 2 (inside), 3 (last)
+                        for i in span:
+                            final_question[i] = 2
+                        final_question[typeinfo.pstart] = 1
+                        final_question[typeinfo.pend] = 3
+                        # adding 2 records a value hit on the column
+                        if final_header[typeinfo.index] < 2:
+                            final_header[typeinfo.index] += 2
+
+            # collect schema_link
+            schema_link = []
+            for sl in newtypeinfos:
+                if sl.label not in ('val', 'col'):
+                    continue
+                column_span = '空列'
+                if sl.index != -1:
+                    column_span = table['header_name'][sl.index]
+                schema_link.append({
+                    'question_len': max(0, sl.pend - sl.pstart + 1),
+                    'question_index': [sl.pstart, sl.pend],
+                    'question_span': ''.join(nlu_t[sl.pstart:sl.pend + 1]),
+                    'column_index': sl.index,
+                    'column_span': column_span,
+                    'label': sl.label,
+                    'weight': round(sl.weight, 4)
+                })
+
+            # get the match score of each table
+            search_result_list.append({
+                'table_id': table['table_id'],
+                'question_knowledge': final_question,
+                'header_knowledge': final_header,
+                'schema_link': schema_link,
+                'match_score': self.get_table_match_score(nlu_t, schema_link)
+            })
+
+        # best matching tables first; at most four are returned
+        search_result_list.sort(key=lambda x: x['match_score'], reverse=True)
+        return search_result_list[0:4]
diff --git a/modelscope/preprocessors/star3/fields/struct.py b/modelscope/preprocessors/star3/fields/struct.py
new file mode 100644
index 00000000..3c2e664b
--- /dev/null
+++ b/modelscope/preprocessors/star3/fields/struct.py
@@ -0,0 +1,181 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+cond_ops = ['>', '<', '==', '!=', 'ASC', 'DESC']
+agg_ops = [
+    '', 'AVG', 'MAX', 'MIN', 'COUNT', 'SUM', 'COMPARE', 'GROUP BY', 'SAME'
+]
+conn_ops = ['', 'AND', 'OR']
+
+
+class Context:
+
+    def __init__(self):
+        self.history_sql = None
+
+    def set_history_sql(self, sql):
+        self.history_sql = sql
+
+
+class SQLQuery:
+
+    def __init__(self, string, query, sql_result):
+        self.string = string
+        self.query = query
+        self.sql_result = sql_result
+
+
+class TrieNode(object):
+
+    def __init__(self):
+        """
+        Creates an empty node: no entries in data, is_word unset, no term.
+        """
+        self.data = {}
+        self.is_word = False
+        self.term = None
+
+
+class Trie(object):
+
+    def __init__(self):
+        self.root = TrieNode()
+
+    def insert(self, word, term):
+        """
+        Inserts a word into the trie and stores term on its last node.
+        :type word: str
+        :rtype: void
+        """
+        node = self.root
+        for letter in word:
+            child = node.data.get(letter)
+            if not child:
+                node.data[letter] = TrieNode()
+            node = node.data[letter]
+        node.is_word = True
+        node.term = term
+
+    def search(self, word):
+        """
+        Walks the trie along word; a pure prefix yields (None, True).
+        :type word: str
+        :rtype: tuple (term, found) -- (None, False) if the path is missing
+        """
+        node = self.root
+        for letter in word:
+            node = node.data.get(letter)
+            if not node:
+                return None, False
+        return node.term, True
+
+    def match(self, query):
+        """Maps every substring of query that is a stored word to its term."""
+        start, end = 0, 1
+        length = len(query)
+        ans = {}
+        while start < length:
+            sub = query[start:end]
+            term, flag = self.search(sub)
+            if flag and term is not None:
+                ans[sub] = term
+            if flag and end < length:
+                end += 1
+            else:
+                start += 1
+                end = start + 1
+        return ans
+
+    def starts_with(self, prefix):
+        """
+        Returns if there is any word in the trie
+        that starts with the given prefix.
+        :type prefix: str
+        :rtype: bool
+        """
+        node = self.root
+        for letter in prefix:
+            node = node.data.get(letter)
+            if not node:
+                return False
+        return True
+
+    def get_start(self, prefix):
+        """
+        Returns the stored words that start with prefix; when prefix is
+        itself a stored word, only [prefix] is returned.
+        :return: words (list)
+        """
+
+        def _get_key(pre, pre_node):
+            words_list = []
+            if pre_node.is_word:
+                words_list.append(pre)
+            for x, child in pre_node.data.items():
+                words_list.extend(_get_key(pre + str(x), child))
+            return words_list
+
+        node = self.root
+        for letter in prefix:
+            node = node.data.get(letter)
+            if not node:
+                return []
+        # search() returns a (term, found) tuple that is always truthy, so
+        # check the node's is_word flag to see if prefix is itself a word.
+        if node.is_word:
+            return [prefix]
+        return _get_key(prefix, node)
+
+
+class TypeInfo:
+
+    def __init__(self, label, index, linktype, value, orgvalue, pstart, pend,
+                 weight):
+        self.label = label
+        self.index = index
+        self.linktype = linktype
+        self.value = value
+        self.orgvalue = orgvalue
+        self.pstart = pstart
+        self.pend = pend
+        self.weight = weight
+
+
+class Constant:
+
+    def __init__(self):
+        self.action_ops = [
+            'add_cond', 'change_cond', 'del_cond', 'change_focus_total',
+            'change_agg_only', 'del_focus', 'restart', 'switch_table',
+            'out_of_scripts', 'repeat', 'firstTurn'
+        ]
+
+        self.agg_ops = [
+            '', 'AVG', 'MAX', 'MIN', 'COUNT', 'SUM', 'COMPARE', 'GROUP BY',
+            'SAME'
+        ]
+
+        self.cond_ops = ['>', '<', '==', '!=', 'ASC', 'DESC']
+
+        self.cond_conn_ops = ['', 'AND', 'OR']
+
+        self.col_type_dict = {
+            'null': 0,
+            'text': 1,
+            'number': 2,
+            'duration': 3,
+            'bool': 4,
+            'date': 5
+        }
+
+        self.schema_link_dict = {
+            'col_start': 1,
+            'col_middle': 2,
+            'col_end': 3,
+            'val_start': 4,
+            'val_middle': 5,
+            'val_end': 6
+        }
+
+        self.max_select_num = 4
+
+        self.max_where_num = 6
diff --git a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py
new file mode 100644
index 00000000..163759a1
--- /dev/null
+++ b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py
@@ -0,0 +1,118 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict
+
+import torch
+from transformers import BertTokenizer
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.preprocessors.star3.fields.database import Database
+from modelscope.preprocessors.star3.fields.schema_link import SchemaLinker
+from modelscope.utils.config import Config
+from modelscope.utils.constant import Fields, ModelFile
+from modelscope.utils.type_assert import type_assert
+
+__all__ = ['TableQuestionAnsweringPreprocessor']
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp,
+    module_name=Preprocessors.table_question_answering_preprocessor)
+class TableQuestionAnsweringPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, db: Database = None, *args, **kwargs):
+        """preprocess the data
+
+        Args:
+            model_dir (str): model path
+            db (Database): database instance; built from model_dir if None
+        """
+        super().__init__(*args, **kwargs)
+
+        self.model_dir: str = model_dir
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+
+        # read tokenizer
+        self.tokenizer = BertTokenizer(
+            os.path.join(self.model_dir, ModelFile.VOCAB_FILE))
+
+        # read database
+        if db is None:
+            self.db = Database(
+                tokenizer=self.tokenizer,
+                table_file_path=os.path.join(self.model_dir, 'table.json'),
+                syn_dict_file_path=os.path.join(self.model_dir, 'synonym.txt'))
+        else:
+            self.db = db
+
+        # get schema linker
+        self.schema_linker = SchemaLinker()
+
+        # set device
+        self.device = 'cuda' if \
+            ('device' not in kwargs or kwargs['device'] == 'gpu') \
+            and torch.cuda.is_available() else 'cpu'
+
+    def construct_data(self, search_result_list, nlu, nlu_t, db, history_sql):
+        datas = []
+        for search_result in search_result_list:
+            data = {}
+            data['table_id'] = search_result['table_id']
+            data['question'] = nlu
+            data['question_tok'] = nlu_t
+            data['header_tok'] = db.tables[data['table_id']]['header_tok']
+            data['types'] = db.tables[data['table_id']]['header_types']
+            data['units'] = db.tables[data['table_id']]['header_units']
+            data['action'] = 0
+            data['sql'] = None
+            data['history_sql'] = history_sql
+            data['wvi_corenlp'] = []
+            data['bertindex_knowledge'] = search_result['question_knowledge']
+            data['header_knowledge'] = search_result['header_knowledge']
+            data['schema_link'] = search_result['schema_link']
+            datas.append(data)
+
+        return datas
+
+    @type_assert(object, dict)
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (dict):
+                question: a sentence
+                history_sql: predicted sql of last question
+                Example:
+                    question: 'Which of these are hiring?'
+                    history_sql: None
+
+        Returns:
+            Dict[str, Any]: {'datas': [...]} for the first linking result
+        """
+
+        # tokenize question
+        question = data['question']
+        history_sql = data['history_sql']
+        nlu = question.lower()
+        nlu_t = self.tokenizer.tokenize(nlu)
+
+        # get linking
+        search_result_list = self.schema_linker.get_entity_linking(
+            tokenizer=self.tokenizer,
+            nlu=nlu,
+            nlu_t=nlu_t,
+            tables=self.db.tables,
+            col_syn_dict=self.db.syn_dict)
+
+        # collect data
+        datas = self.construct_data(
+            search_result_list=search_result_list[0:1],
+            nlu=nlu,
+            nlu_t=nlu_t,
+            db=self.db,
+            history_sql=history_sql)
+
+        return {'datas': datas}
diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py
index af539dda..0b0ea61d 100644
--- a/modelscope/utils/nlp/nlp_utils.py
+++ b/modelscope/utils/nlp/nlp_utils.py
@@ -3,7 +3,8 @@ from typing import List
 
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline,
-                                      DialogStateTrackingPipeline)
+                                      DialogStateTrackingPipeline,
+                                      TableQuestionAnsweringPipeline)
 
 
 def text2sql_tracking_and_print_results(
@@ -42,3 +43,17 @@ def tracking_and_print_dialog_states(
         print(json.dumps(result))
 
         history_states.extend([result[OutputKeys.OUTPUT], {}])
+
+
+def tableqa_tracking_and_print_results(
+        test_case, pipelines: List[TableQuestionAnsweringPipeline]):
+    for pipeline in pipelines:
+        historical_queries = None
+        for question in test_case['utterance']:
+            output_dict = pipeline({
+                'question': question,
+                'history_sql': historical_queries
+            })
+            print('output_dict', output_dict['output'].string,
+                  output_dict['output'].query)
+            historical_queries = output_dict['history']
diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py
new file mode 100644
index 00000000..3c416cd5
--- /dev/null
+++ b/tests/pipelines/test_table_question_answering.py
@@ -0,0 +1,76 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import unittest
+from typing import List
+
+from transformers import BertTokenizer
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline
+from modelscope.preprocessors import TableQuestionAnsweringPreprocessor
+from modelscope.preprocessors.star3.fields.database import Database
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.nlp.nlp_utils import tableqa_tracking_and_print_results
+from modelscope.utils.test_utils import test_level
+
+
+class TableQuestionAnswering(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.task = Tasks.table_question_answering
+        self.model_id = 'damo/nlp_convai_text2sql_pretrain_cn'
+
+    model_id = 'damo/nlp_convai_text2sql_pretrain_cn'
+    test_case = {
+        'utterance':
+        ['长江流域的小(2)型水库的库容总量是多少？', '那平均值是多少？', '那水库的名称呢？', '换成中型的呢？']
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        preprocessor = TableQuestionAnsweringPreprocessor(model_dir=cache_path)
+        pipelines = [
+            TableQuestionAnsweringPipeline(
+                model=cache_path, preprocessor=preprocessor)
+        ]
+        tableqa_tracking_and_print_results(self.test_case, pipelines)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        preprocessor = TableQuestionAnsweringPreprocessor(
+            model_dir=model.model_dir)
+        pipelines = [
+            TableQuestionAnsweringPipeline(
+                model=model, preprocessor=preprocessor)
+        ]
+        tableqa_tracking_and_print_results(self.test_case, pipelines)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_task(self):
+        pipelines = [pipeline(Tasks.table_question_answering, self.model_id)]
+        tableqa_tracking_and_print_results(self.test_case, pipelines)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_model_from_modelhub_with_other_classes(self):
+        model = Model.from_pretrained(self.model_id)
+        self.tokenizer = BertTokenizer(
+            os.path.join(model.model_dir, ModelFile.VOCAB_FILE))
+        db = Database(
+            tokenizer=self.tokenizer,
+            table_file_path=os.path.join(model.model_dir, 'table.json'),
+            syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt'))
+        preprocessor = TableQuestionAnsweringPreprocessor(
+            model_dir=model.model_dir, db=db)
+        pipelines = [
+            TableQuestionAnsweringPipeline(
+                model=model, preprocessor=preprocessor, db=db)
+        ]
+        tableqa_tracking_and_print_results(self.test_case, pipelines)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 7cb72cc46e4d0fc5b7f92ab43ae27bdfcae788d2 Mon Sep 17 00:00:00 2001
From: "xingjun.wxj" <xingjun.wxj@alibaba-inc.com>
Date: Wed, 14 Sep 2022 19:24:48 +0800
Subject: [PATCH 100/175] [to #42322933]MsDataset upload bugfix for 0830
 version.

CR link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10109035#tab=changes&file=8348e8153b2f4a6dbd52e471b4980542355408ed

Please refer to aone links:

1. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44889184

2. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44858810

3. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44857728

4. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44658972
---
 modelscope/hub/api.py                   |  2 +-
 modelscope/hub/errors.py                |  2 +-
 modelscope/hub/git.py                   | 16 +++++++++------
 modelscope/hub/repository.py            | 26 ++++++++++++++++++++----
 modelscope/msdatasets/ms_dataset.py     | 27 ++++++++++++++++---------
 tests/msdatasets/test_dataset_upload.py |  1 -
 6 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 721f5637..85da6a31 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -389,7 +389,7 @@ class HubApi:
         cookies = requests.utils.dict_from_cookiejar(cookies)
         r = requests.get(url=datahub_url, cookies=cookies)
         resp = r.json()
-        datahub_raise_on_error(datahub_url, resp)
+        raise_on_error(resp)
         return resp['Data']
 
     def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py
index e9c008b0..284dbed4 100644
--- a/modelscope/hub/errors.py
+++ b/modelscope/hub/errors.py
@@ -60,7 +60,7 @@ def raise_on_error(rsp):
     Args:
         rsp (_type_): The server response
     """
-    if rsp['Code'] == HTTPStatus.OK and rsp['Success']:
+    if rsp['Code'] == HTTPStatus.OK:
         return True
     else:
         raise RequestError(rsp['Message'])
diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py
index 264cd59a..13e1910d 100644
--- a/modelscope/hub/git.py
+++ b/modelscope/hub/git.py
@@ -51,12 +51,16 @@ class GitCommandWrapper(metaclass=Singleton):
             response.check_returncode()
             return response
         except subprocess.CalledProcessError as error:
-            logger.error(
-                'There are error run git command, you may need to login first.'
-            )
-            raise GitError(
-                'stdout: %s, stderr: %s' %
-                (response.stdout.decode('utf8'), error.stderr.decode('utf8')))
+            if response.returncode == 1:
+                logger.info('Nothing to commit.')
+                return response
+            else:
+                logger.error(
+                    'There are error run git command, you may need to login first.'
+                )
+                raise GitError('stdout: %s, stderr: %s' %
+                               (response.stdout.decode('utf8'),
+                                error.stderr.decode('utf8')))
 
     def config_auth_token(self, repo_dir, auth_token):
         url = self.get_repo_remote_url(repo_dir)
diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py
index 6f560f7a..8d5fd30b 100644
--- a/modelscope/hub/repository.py
+++ b/modelscope/hub/repository.py
@@ -40,6 +40,11 @@ class Repository:
         self.model_dir = model_dir
         self.model_base_dir = os.path.dirname(model_dir)
         self.model_repo_name = os.path.basename(model_dir)
+
+        if not revision:
+            err_msg = 'a non-default value of revision cannot be empty.'
+            raise InvalidParameter(err_msg)
+
         if auth_token:
             self.auth_token = auth_token
         else:
@@ -145,10 +150,21 @@ class DatasetRepository:
                 The git command line path, if None, we use 'git'
         """
         self.dataset_id = dataset_id
-        self.repo_work_dir = repo_work_dir
-        self.repo_base_dir = os.path.dirname(repo_work_dir)
-        self.repo_name = os.path.basename(repo_work_dir)
+        if not repo_work_dir or not isinstance(repo_work_dir, str):
+            err_msg = 'dataset_work_dir must be provided!'
+            raise InvalidParameter(err_msg)
+        self.repo_work_dir = repo_work_dir.rstrip('/')
+        if not self.repo_work_dir:
+            err_msg = 'dataset_work_dir can not be root dir!'
+            raise InvalidParameter(err_msg)
+        self.repo_base_dir = os.path.dirname(self.repo_work_dir)
+        self.repo_name = os.path.basename(self.repo_work_dir)
+
+        if not revision:
+            err_msg = 'a non-default value of revision cannot be empty.'
+            raise InvalidParameter(err_msg)
         self.revision = revision
+
         if auth_token:
             self.auth_token = auth_token
         else:
@@ -199,7 +215,9 @@ class DatasetRepository:
         self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token)
         self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name)
 
-        remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
+        remote_url = self._get_remote_url()
+        remote_url = self.git_wrapper.remove_token_from_url(remote_url)
+
         self.git_wrapper.pull(self.repo_work_dir)
         self.git_wrapper.add(self.repo_work_dir, all_files=True)
         self.git_wrapper.commit(self.repo_work_dir, commit_message)
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 691db4fe..a0203df9 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -220,18 +220,23 @@ class MsDataset:
         api = HubApi()
         download_dataset = ''
         if isinstance(dataset_name, str):
-            download_dataset = dataset_name
             dataset_formation = DatasetFormations.native
-            if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
-                    (os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
+            if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(
+                    dataset_name):
                 dataset_formation = DatasetFormations.hf_compatible
+            elif os.path.isfile(dataset_name) and dataset_name.endswith('.py'):
+                dataset_formation = DatasetFormations.hf_compatible
+                file_name = os.path.basename(dataset_name)
+                download_dataset = os.path.splitext(file_name)[0]
             elif is_relative_path(dataset_name) and dataset_name.count(
                     '/') == 0:
+                download_dataset = dataset_name
                 dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts(
                     dataset_name, namespace, download_mode, version)
                 # dataset organized to be compatible with hf format
                 if dataset_formation == DatasetFormations.hf_compatible:
                     dataset_name = dataset_scripts['.py'][0]
+                    download_dataset = dataset_name
             else:
                 raise FileNotFoundError(
                     f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} "
@@ -268,8 +273,11 @@ class MsDataset:
                             f' {type(dataset_name)}')
 
         if download_dataset:
-            api.on_dataset_download(
-                dataset_name=download_dataset, namespace=namespace)
+            try:
+                api.on_dataset_download(
+                    dataset_name=download_dataset, namespace=namespace)
+            except Exception as e:
+                logger.error(e)
 
         return MsDataset.from_hf_dataset(dataset, target=target)
 
@@ -587,7 +595,7 @@ class MsDataset:
         """Clone meta-file of dataset from the ModelScope Hub.
         Args:
             dataset_work_dir (str): Current git working directory.
-            dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name .
+            dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name .
             revision(`Optional[str]`):
                 revision of the model you want to clone from. Can be any of a branch, tag or commit hash
             auth_token(`Optional[str]`):
@@ -609,11 +617,11 @@ class MsDataset:
         if clone_work_dir:
             logger.info('Already cloned repo to: {}'.format(clone_work_dir))
         else:
-            logger.warning('The repo working dir is already ex.')
+            logger.warning(
+                'Repo dir already exists: {}'.format(clone_work_dir))
 
     @staticmethod
     def upload_meta(dataset_work_dir: str,
-                    dataset_id: str,
                     commit_message: str,
                     revision: Optional[str] = DEFAULT_DATASET_REVISION,
                     auth_token: Optional[str] = None,
@@ -623,7 +631,6 @@ class MsDataset:
 
         Args:
             dataset_work_dir (str): Current working directory.
-            dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name .
             commit_message (str): Commit message.
             revision(`Optional[str]`):
                 revision of the model you want to clone from. Can be any of a branch, tag or commit hash
@@ -640,7 +647,7 @@ class MsDataset:
         """
         _repo = DatasetRepository(
             repo_work_dir=dataset_work_dir,
-            dataset_id=dataset_id,
+            dataset_id='',
             revision=revision,
             auth_token=auth_token,
             git_path=git_path)
diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py
index 61b1c6a4..1179414d 100644
--- a/tests/msdatasets/test_dataset_upload.py
+++ b/tests/msdatasets/test_dataset_upload.py
@@ -87,7 +87,6 @@ class DatasetUploadTest(unittest.TestCase):
 
         MsDataset.upload_meta(
             dataset_work_dir=self.test_meta_dir,
-            dataset_id=os.path.join(self.namespace, self.dataset_name),
             commit_message='Update for unit test.')
 
 
From adee5d10aedbb046ec47b621d86846ed26b71548 Mon Sep 17 00:00:00 2001
From: "jiangnana.jnn" <jiangnana.jnn@alibaba-inc.com>
Date: Thu, 15 Sep 2022 18:11:03 +0800
Subject: [PATCH 101/175] update EasyCV MsDataset         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10103248

    * update EasyCV MSDataset
---
 modelscope/msdatasets/cv/easycv_base.py       | 35 ++++---------------
 .../face_2d_keypoints_dataset.py              | 28 +++++++++++++--
 .../classification_dataset.py                 | 25 ++++++++++---
 .../segmentation_dataset.py                   | 27 ++------------
 .../cv/object_detection/detection_dataset.py  | 25 +++----------
 modelscope/msdatasets/ms_dataset.py           |  4 +--
 setup.cfg                                     |  4 +--
 tests/trainers/easycv/test_segformer.py       |  3 +-
 8 files changed, 65 insertions(+), 86 deletions(-)

diff --git a/modelscope/msdatasets/cv/easycv_base.py b/modelscope/msdatasets/cv/easycv_base.py
index 92b77389..a45827a3 100644
--- a/modelscope/msdatasets/cv/easycv_base.py
+++ b/modelscope/msdatasets/cv/easycv_base.py
@@ -1,26 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
 import os.path as osp
 
 
 class EasyCVBaseDataset(object):
     """Adapt to MSDataset.
-    Subclasses need to implement ``DATA_STRUCTURE``, the format is as follows, e.g.:
-
-    {
-        '${data source name}': {
-            'train':{
-                '${image root arg}': 'images',  # directory name of images relative to the root path
-                '${label root arg}': 'labels',  # directory name of lables relative to the root path
-                ...
-            },
-            'validation': {
-                '${image root arg}': 'images',
-                '${label root arg}': 'labels',
-                ...
-            }
-        }
-    }
 
     Args:
         split_config (dict): Dataset root path from MSDataset, e.g.
@@ -29,7 +12,7 @@ class EasyCVBaseDataset(object):
             the model if supplied. Not support yet.
         mode: Training or Evaluation.
     """
-    DATA_STRUCTURE = None
+    DATA_ROOT_PATTERN = '${data_root}'
 
     def __init__(self,
                  split_config=None,
@@ -45,15 +28,9 @@ class EasyCVBaseDataset(object):
 
     def _update_data_source(self, data_source):
         data_root = next(iter(self.split_config.values()))
-        split = next(iter(self.split_config.keys()))
+        data_root = data_root.rstrip(osp.sep)
 
-        # TODO: msdataset should support these keys to be configured in the dataset's json file and passed in
-        if data_source['type'] not in list(self.DATA_STRUCTURE.keys()):
-            raise ValueError(
-                'Only support %s now, but get %s.' %
-                (list(self.DATA_STRUCTURE.keys()), data_source['type']))
-
-        # join data root path of msdataset and default relative name
-        update_args = self.DATA_STRUCTURE[data_source['type']][split]
-        for k, v in update_args.items():
-            data_source.update({k: osp.join(data_root, v)})
+        for k, v in data_source.items():
+            if isinstance(v, str) and self.DATA_ROOT_PATTERN in v:
+                data_source.update(
+                    {k: v.replace(self.DATA_ROOT_PATTERN, data_root)})
diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
index a902999d..2f2e03ef 100644
--- a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
+++ b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
@@ -2,6 +2,7 @@
 from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset
 
 from modelscope.metainfo import Datasets
+from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset
 from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
 from modelscope.utils.constant import Tasks
 
@@ -9,5 +10,28 @@ from modelscope.utils.constant import Tasks
 @TASK_DATASETS.register_module(
     group_key=Tasks.face_2d_keypoints,
     module_name=Datasets.Face2dKeypointsDataset)
-class FaceKeypointDataset(_FaceKeypointDataset):
-    """EasyCV dataset for face 2d keypoints."""
+class FaceKeypointDataset(EasyCVBaseDataset, _FaceKeypointDataset):
+    """EasyCV dataset for face 2d keypoints.
+
+    Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"evaluation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied. Not support yet.
+        mode: Training or Evaluation.
+    """
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVBaseDataset.__init__(
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)
+        _FaceKeypointDataset.__init__(self, *args, **kwargs)
diff --git a/modelscope/msdatasets/cv/image_classification/classification_dataset.py b/modelscope/msdatasets/cv/image_classification/classification_dataset.py
index c7145f2b..ba73e472 100644
--- a/modelscope/msdatasets/cv/image_classification/classification_dataset.py
+++ b/modelscope/msdatasets/cv/image_classification/classification_dataset.py
@@ -2,6 +2,7 @@
 from easycv.datasets.classification import ClsDataset as _ClsDataset
 
 from modelscope.metainfo import Datasets
+from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset
 from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
 from modelscope.utils.constant import Tasks
 
@@ -10,10 +11,26 @@ from modelscope.utils.constant import Tasks
     group_key=Tasks.image_classification, module_name=Datasets.ClsDataset)
 class ClsDataset(_ClsDataset):
     """EasyCV dataset for classification.
-    For more details, please refer to :
-    https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/classification/raw.py .
 
     Args:
-        data_source: Data source config to parse input data.
-        pipeline: Sequence of transform object or config dict to be composed.
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"evaluation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied. Not support yet.
+        mode: Training or Evaluation.
     """
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVBaseDataset.__init__(
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)
+        _ClsDataset.__init__(self, *args, **kwargs)
diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
index c53e1431..b1316e2e 100644
--- a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
+++ b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
@@ -1,6 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path as osp
-
 from easycv.datasets.segmentation import SegDataset as _SegDataset
 
 from modelscope.metainfo import Datasets
@@ -9,30 +7,9 @@ from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
 from modelscope.utils.constant import Tasks
 
 
-class EasyCVSegBaseDataset(EasyCVBaseDataset):
-    DATA_STRUCTURE = {
-        # data source name
-        'SegSourceRaw': {
-            'train': {
-                'img_root':
-                'images',  # directory name of images relative to the root path
-                'label_root':
-                'annotations',  # directory name of annotation relative to the root path
-                'split':
-                'train.txt'  # split file name relative to the root path
-            },
-            'validation': {
-                'img_root': 'images',
-                'label_root': 'annotations',
-                'split': 'val.txt'
-            }
-        }
-    }
-
-
 @TASK_DATASETS.register_module(
     group_key=Tasks.image_segmentation, module_name=Datasets.SegDataset)
-class SegDataset(EasyCVSegBaseDataset, _SegDataset):
+class SegDataset(EasyCVBaseDataset, _SegDataset):
     """EasyCV dataset for Sementic segmentation.
     For more details, please refer to :
     https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/segmentation/raw.py .
@@ -55,7 +32,7 @@ class SegDataset(EasyCVSegBaseDataset, _SegDataset):
                  mode=None,
                  *args,
                  **kwargs) -> None:
-        EasyCVSegBaseDataset.__init__(
+        EasyCVBaseDataset.__init__(
             self,
             split_config=split_config,
             preprocessor=preprocessor,
diff --git a/modelscope/msdatasets/cv/object_detection/detection_dataset.py b/modelscope/msdatasets/cv/object_detection/detection_dataset.py
index e3aaaa92..2f6ad7d3 100644
--- a/modelscope/msdatasets/cv/object_detection/detection_dataset.py
+++ b/modelscope/msdatasets/cv/object_detection/detection_dataset.py
@@ -11,26 +11,9 @@ from modelscope.msdatasets.task_datasets import TASK_DATASETS
 from modelscope.utils.constant import Tasks
 
 
-class EasyCVDetBaseDataset(EasyCVBaseDataset):
-    DATA_STRUCTURE = {
-        'DetSourceCoco': {
-            'train': {
-                'ann_file':
-                'train.json',  # file name of annotation relative to the root path
-                'img_prefix':
-                'images',  # directory name of images relative to the root path
-            },
-            'validation': {
-                'ann_file': 'val.json',
-                'img_prefix': 'images',
-            }
-        }
-    }
-
-
 @TASK_DATASETS.register_module(
     group_key=Tasks.image_object_detection, module_name=Datasets.DetDataset)
-class DetDataset(EasyCVDetBaseDataset, _DetDataset):
+class DetDataset(EasyCVBaseDataset, _DetDataset):
     """EasyCV dataset for object detection.
     For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/raw.py .
 
@@ -52,7 +35,7 @@ class DetDataset(EasyCVDetBaseDataset, _DetDataset):
                  mode=None,
                  *args,
                  **kwargs) -> None:
-        EasyCVDetBaseDataset.__init__(
+        EasyCVBaseDataset.__init__(
             self,
             split_config=split_config,
             preprocessor=preprocessor,
@@ -65,7 +48,7 @@ class DetDataset(EasyCVDetBaseDataset, _DetDataset):
 @TASK_DATASETS.register_module(
     group_key=Tasks.image_object_detection,
     module_name=Datasets.DetImagesMixDataset)
-class DetImagesMixDataset(EasyCVDetBaseDataset, _DetImagesMixDataset):
+class DetImagesMixDataset(EasyCVBaseDataset, _DetImagesMixDataset):
     """EasyCV dataset for object detection, a wrapper of multiple images mixed dataset.
     Suitable for training on multiple images mixed data augmentation like
     mosaic and mixup. For the augmentation pipeline of mixed image data,
@@ -99,7 +82,7 @@ class DetImagesMixDataset(EasyCVDetBaseDataset, _DetImagesMixDataset):
                  mode=None,
                  *args,
                  **kwargs) -> None:
-        EasyCVDetBaseDataset.__init__(
+        EasyCVBaseDataset.__init__(
             self,
             split_config=split_config,
             preprocessor=preprocessor,
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index a0203df9..58957234 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -403,8 +403,8 @@ class MsDataset:
             )
         if isinstance(self._hf_ds, ExternalDataset):
             task_data_config.update({'preprocessor': preprocessors})
-            return build_task_dataset(task_data_config, task_name,
-                                      self._hf_ds.config_kwargs)
+            task_data_config.update(self._hf_ds.config_kwargs)
+            return build_task_dataset(task_data_config, task_name)
         if preprocessors is not None:
             return self.to_torch_dataset_with_processors(
                 preprocessors, columns=columns)
diff --git a/setup.cfg b/setup.cfg
index c98dbe05..3dc64f86 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,7 +19,7 @@ quiet-level = 3
 ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
 
 [flake8]
-select = B,C,E,F,P,T4,W,B9
 max-line-length = 120
-ignore = F401,F405,F821,W503
+select = B,C,E,F,P,T4,W,B9
+ignore = F401,F405,F821,W503,E251
 exclude = docs/src,*.pyi,.git
diff --git a/tests/trainers/easycv/test_segformer.py b/tests/trainers/easycv/test_segformer.py
index 08da6e41..ce2e1d36 100644
--- a/tests/trainers/easycv/test_segformer.py
+++ b/tests/trainers/easycv/test_segformer.py
@@ -47,7 +47,8 @@ class EasyCVTrainerTestSegformer(unittest.TestCase):
             namespace='EasyCV',
             split='validation')
         kwargs = dict(
-            model='EasyCV/EasyCV-Segformer-b0',
+            model=
+            'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k',
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
             work_dir=self.tmp_dir,

From b0b711b39c7d26cbee06cface6862ad2fc407242 Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Thu, 15 Sep 2022 19:28:39 +0800
Subject: [PATCH 102/175] [to #44964129]fix: ci result always pass

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10140476
---
 tests/run.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/run.py b/tests/run.py
index 18839622..b286ecb5 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -56,12 +56,12 @@ def statistics_test_result(df):
     if failures_cases > 0 or \
        error_cases > 0 or \
        unexpected_success_cases > 0:
-        result = 'FAILED'
+        final_result = 'FAILED'
     else:
-        result = 'SUCCESS'
+        final_result = 'SUCCESS'
     result_msg = '%s (Runs=%s,success=%s,failures=%s,errors=%s,\
     skipped=%s,expected failures=%s,unexpected successes=%s)' % (
-        result, total_cases, success_cases, failures_cases, error_cases,
+        final_result, total_cases, success_cases, failures_cases, error_cases,
         skipped_cases, expected_failure_cases, unexpected_success_cases)
 
     model_cases = get_case_model_info()
@@ -83,7 +83,7 @@ def statistics_test_result(df):
             commit_model_ut_result(model_name, result)
     print('Testing result summary.')
     print(result_msg)
-    if result == 'FAILED':
+    if final_result == 'FAILED':
         sys.exit(1)
 
 
From 7fb25d7bbbce6eba1bfd2e23204ee5dd63eeac74 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Fri, 16 Sep 2022 22:42:39 +0800
Subject: [PATCH 103/175] [to #42322933]fix  UT error for 830 version        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10142442

---
 .../models/nlp/star3/configuration_star3.py   | 13 ------
 modelscope/models/nlp/star3/modeling_star3.py | 44 +++++--------------
 .../models/nlp/table_question_answering.py    |  4 +-
 .../task_models/sequence_classification.py    |  2 +-
 .../pipelines/nlp/fill_mask_pipeline.py       |  2 +-
 .../pipelines/nlp/fill_mask_ponet_pipeline.py |  2 +-
 .../sequence_classification_pipeline_base.py  |  2 +-
 .../nlp/table_question_answering_pipeline.py  |  5 +--
 .../nlp/zero_shot_classification_pipeline.py  |  2 +-
 modelscope/trainers/easycv/__init__.py        | 19 ++++++++
 .../nlp/space/dialog_intent_trainer.py        |  4 +-
 modelscope/trainers/trainer.py                |  2 +-
 modelscope/utils/ast_utils.py                 |  3 ++
 modelscope/utils/nlp/nlp_utils.py             |  1 -
 .../test_table_question_answering.py          |  1 -
 tests/run_config.yaml                         |  3 ++
 tests/trainers/easycv/test_segformer.py       |  8 ++--
 17 files changed, 51 insertions(+), 66 deletions(-)

diff --git a/modelscope/models/nlp/star3/configuration_star3.py b/modelscope/models/nlp/star3/configuration_star3.py
index d49c70c9..4c5ae677 100644
--- a/modelscope/models/nlp/star3/configuration_star3.py
+++ b/modelscope/models/nlp/star3/configuration_star3.py
@@ -18,21 +18,8 @@
 from __future__ import absolute_import, division, print_function
 import copy
 import logging
-import math
-import os
-import shutil
-import tarfile
-import tempfile
-from pathlib import Path
-from typing import Union
 
 import json
-import numpy as np
-import torch
-import torch_scatter
-from icecream import ic
-from torch import nn
-from torch.nn import CrossEntropyLoss
 
 logger = logging.getLogger(__name__)
 
diff --git a/modelscope/models/nlp/star3/modeling_star3.py b/modelscope/models/nlp/star3/modeling_star3.py
index ed5ea1b3..13f7136a 100644
--- a/modelscope/models/nlp/star3/modeling_star3.py
+++ b/modelscope/models/nlp/star3/modeling_star3.py
@@ -17,21 +17,15 @@
 
 from __future__ import absolute_import, division, print_function
 import copy
-import logging
 import math
 import os
 import shutil
 import tarfile
 import tempfile
-from pathlib import Path
-from typing import Union
 
-import json
 import numpy as np
 import torch
-import torch_scatter
 from torch import nn
-from torch.nn import CrossEntropyLoss
 
 from modelscope.models.nlp.star3.configuration_star3 import Star3Config
 from modelscope.utils.constant import ModelFile
@@ -121,33 +115,17 @@ class BertEmbeddings(nn.Module):
         words_embeddings = self.word_embeddings(input_ids)
         header_embeddings = self.word_embeddings(header_ids)
 
-        # header mean pooling
-        header_flatten_embeddings = self.word_embeddings(header_flatten_tokens)
-        header_flatten_index = header_flatten_index.reshape(
-            (-1, header_flatten_index.shape[1], 1))
-        header_flatten_index = header_flatten_index.repeat(
-            1, 1, header_flatten_embeddings.shape[2])
-        header_flatten_output = header_flatten_output.reshape(
-            (-1, header_flatten_output.shape[1], 1))
-        header_flatten_output = header_flatten_output.repeat(
-            1, 1, header_flatten_embeddings.shape[2])
-        header_embeddings = torch_scatter.scatter_mean(
-            header_flatten_embeddings,
-            header_flatten_index,
-            out=header_flatten_output,
-            dim=1)
-        token_column_id = token_column_id.reshape(
-            (-1, token_column_id.shape[1], 1))
-        token_column_id = token_column_id.repeat(
-            (1, 1, header_embeddings.shape[2]))
-        token_column_mask = token_column_mask.reshape(
-            (-1, token_column_mask.shape[1], 1))
-        token_column_mask = token_column_mask.repeat(
-            (1, 1, header_embeddings.shape[2]))
-        token_header_embeddings = torch.gather(header_embeddings, 1,
-                                               token_column_id)
-        words_embeddings = words_embeddings * (1.0 - token_column_mask) + \
-            token_header_embeddings * token_column_mask
+        if col_dict_list is not None and l_hs is not None:
+            col_dict_list = np.array(col_dict_list)[ids.cpu().numpy()].tolist()
+            header_len = np.array(
+                header_len, dtype=object)[ids.cpu().numpy()].tolist()
+            for bi, col_dict in enumerate(col_dict_list):
+                for ki, vi in col_dict.items():
+                    length = header_len[bi][vi]
+                    if length == 0:
+                        continue
+                    words_embeddings[bi, ki, :] = torch.mean(
+                        header_embeddings[bi, vi, :length, :], dim=0)
 
         position_embeddings = self.position_embeddings(position_ids)
         token_type_embeddings = self.token_type_embeddings(token_type_ids)
diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py
index 19fdf178..3c91a518 100644
--- a/modelscope/models/nlp/table_question_answering.py
+++ b/modelscope/models/nlp/table_question_answering.py
@@ -1,11 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import os
-from typing import Dict, Optional
+from typing import Dict
 
 import numpy
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
 from transformers import BertTokenizer
 
@@ -15,7 +14,6 @@ from modelscope.models.builder import MODELS
 from modelscope.models.nlp.star3.configuration_star3 import Star3Config
 from modelscope.models.nlp.star3.modeling_star3 import Seq2SQL, Star3Model
 from modelscope.preprocessors.star3.fields.struct import Constant
-from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.device import verify_device
 
diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py
index 988f2917..80bfd476 100644
--- a/modelscope/models/nlp/task_models/sequence_classification.py
+++ b/modelscope/models/nlp/task_models/sequence_classification.py
@@ -48,7 +48,7 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase):
         self.build_backbone(backbone_cfg)
         self.build_head(head_cfg)
 
-    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
         outputs = super().forward(input)
         sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
         outputs = self.head.forward(pooled_output)
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index caba4122..db6b61c6 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -101,7 +101,7 @@ class FillMaskPipeline(Pipeline):
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
         with torch.no_grad():
-            return self.model(inputs, **forward_params)
+            return self.model(**inputs, **forward_params)
 
     def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
         """process the prediction results
diff --git a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py
index 0bb72430..9770fc38 100644
--- a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py
@@ -97,7 +97,7 @@ class FillMaskPonetPipeline(Pipeline):
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
         with torch.no_grad():
-            return self.model(inputs, **forward_params)
+            return self.model(**inputs, **forward_params)
 
     def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
         """process the prediction results
diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
index 25d68993..28bbc732 100644
--- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
+++ b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
@@ -35,7 +35,7 @@ class SequenceClassificationPipelineBase(Pipeline):
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
         with torch.no_grad():
-            return self.model(inputs, **forward_params)
+            return self.model(**inputs, **forward_params)
 
     def postprocess(self,
                     inputs: Dict[str, Any],
diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py
index 8235a4d6..96bfbc34 100644
--- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py
+++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py
@@ -2,7 +2,6 @@
 import os
 from typing import Any, Dict, Union
 
-import torch
 from transformers import BertTokenizer
 
 from modelscope.metainfo import Pipelines
@@ -88,7 +87,7 @@ class TableQuestionAnsweringPipeline(Pipeline):
             return current_sql
 
         elif action == 'del_focus':
-            pre_final_sql = copy.deepcopy(history_sql)
+            pre_final_sql = history_sql
             pre_sels = []
             pre_aggs = []
             for idx, seli in enumerate(pre_final_sql['sel']):
@@ -151,7 +150,7 @@ class TableQuestionAnsweringPipeline(Pipeline):
             return pre_final_sql
 
         elif action == 'del_cond':
-            pre_final_sql = copy.deepcopy(history_sql)
+            pre_final_sql = history_sql
 
             final_conds = []
 
diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
index e39cb0e1..38c0ee77 100644
--- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
@@ -85,7 +85,7 @@ class ZeroShotClassificationPipeline(Pipeline):
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
         with torch.no_grad():
-            return self.model(inputs, **forward_params)
+            return self.model(**inputs, **forward_params)
 
     def postprocess(self,
                     inputs: Dict[str, Any],
diff --git a/modelscope/trainers/easycv/__init__.py b/modelscope/trainers/easycv/__init__.py
index e69de29b..b1b8fc15 100644
--- a/modelscope/trainers/easycv/__init__.py
+++ b/modelscope/trainers/easycv/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .utils import AddLrLogHook, EasyCVMetric
+else:
+    _import_structure = {'utils': ['AddLrLogHook', 'EasyCVMetric']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
index 515cd46d..c559ee5b 100644
--- a/modelscope/trainers/nlp/space/dialog_intent_trainer.py
+++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
@@ -5,7 +5,7 @@ from typing import Callable, Dict, Optional, Tuple, Union
 import numpy as np
 
 from modelscope.metainfo import Trainers
-from modelscope.models.nlp.space.model.generator import Generator
+from modelscope.models.nlp.space.model.generator import SpaceGenerator
 from modelscope.models.nlp.space.model.model_base import SpaceModelBase
 from modelscope.preprocessors.space.data_loader import \
     get_sequential_data_loader
@@ -90,7 +90,7 @@ class DialogIntentTrainer(BaseTrainer):
             data_type='test')
 
         # set generator
-        generator = Generator.create(self.cfg, reader=bpe)
+        generator = SpaceGenerator.create(self.cfg, reader=bpe)
         # construct model
         self.model = SpaceModelBase.create(
             self.cfg.Model.init_checkpoint,
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index 8dc75a65..d771d9d6 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -542,7 +542,7 @@ class EpochBasedTrainer(BaseTrainer):
                 value = train_outputs.get(key, None)
                 if value is not None:
                     if dist.is_available() and dist.is_initialized():
-                        value = value.data.clone()
+                        value = value.data.clone().to('cuda')
                         dist.all_reduce(value.div_(dist.get_world_size()))
                     log_vars.update({key: value.item()})
             self.log_buffer.update(log_vars)
diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py
index 62c31397..cdafafd3 100644
--- a/modelscope/utils/ast_utils.py
+++ b/modelscope/utils/ast_utils.py
@@ -293,6 +293,9 @@ class AstScaning(object):
                     if type(attribute_node).__name__ == 'Str':
                         result.append((getattr(node,
                                                'arg'), attribute_node.s, None))
+                    elif type(attribute_node).__name__ == 'Constant':
+                        result.append(
+                            (getattr(node, 'arg'), attribute_node.value, None))
                     else:
                         result.append((getattr(node, 'arg'), )
                                       + _get_attribute_item(attribute_node))
diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py
index 0b0ea61d..eba12103 100644
--- a/modelscope/utils/nlp/nlp_utils.py
+++ b/modelscope/utils/nlp/nlp_utils.py
@@ -1,4 +1,3 @@
-import os.path as osp
 from typing import List
 
 from modelscope.outputs import OutputKeys
diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py
index 3c416cd5..7ea28725 100644
--- a/tests/pipelines/test_table_question_answering.py
+++ b/tests/pipelines/test_table_question_answering.py
@@ -1,7 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 import unittest
-from typing import List
 
 from transformers import BertTokenizer
 
diff --git a/tests/run_config.yaml b/tests/run_config.yaml
index f44053f6..fc983023 100644
--- a/tests/run_config.yaml
+++ b/tests/run_config.yaml
@@ -6,6 +6,9 @@ isolated:  # test cases that may require excessive anmount of GPU memory, which
   - test_video_summarization.py
   - test_dialog_modeling.py
   - test_csanmt_translation.py
+  - test_image_super_resolution.py
+  - test_easycv_trainer.py
+  - test_segformer.py
 
 envs:
   default: # default env, case not in other env will in default, pytorch.
diff --git a/tests/trainers/easycv/test_segformer.py b/tests/trainers/easycv/test_segformer.py
index ce2e1d36..90a66635 100644
--- a/tests/trainers/easycv/test_segformer.py
+++ b/tests/trainers/easycv/test_segformer.py
@@ -31,11 +31,11 @@ class EasyCVTrainerTestSegformer(unittest.TestCase):
         shutil.rmtree(self.tmp_dir, ignore_errors=True)
 
     def _train(self):
-        # adapt to distributed mode
-        from easycv.utils.test_util import pseudo_dist_init
-        pseudo_dist_init()
 
-        cfg_options = {'train.max_epochs': 2}
+        cfg_options = {
+            'train.max_epochs': 2,
+            'model.decode_head.norm_cfg.type': 'BN'
+        }
 
         trainer_name = Trainers.easycv
         train_dataset = MsDataset.load(

From 2223b9f16a7d88714ce415cc20ec384bec64afc3 Mon Sep 17 00:00:00 2001
From: "shouzhou.bx" <shouzhou.bx@alibaba-inc.com>
Date: Mon, 19 Sep 2022 09:44:19 +0800
Subject: [PATCH 104/175] [to #42322933] format body 2d keypoint output boxes

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10158585
---
 modelscope/outputs.py                                 | 6 +++---
 modelscope/pipelines/cv/body_2d_keypoints_pipeline.py | 5 ++++-
 modelscope/utils/cv/image_utils.py                    | 4 ++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index d7d619bf..b3eb9ad8 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -202,9 +202,9 @@ TASK_OUTPUTS = {
     #               [[score]*15]
     #              ]
     #   "boxes": [
-    #               [[x1, y1], [x2, y2]],
-    #               [[x1, y1], [x2, y2]],
-    #               [[x1, y1], [x2, y2]],
+    #               [x1, y1, x2, y2],
+    #               [x1, y1, x2, y2],
+    #               [x1, y1, x2, y2],
     #             ]
     # }
     Tasks.body_2d_keypoints:
diff --git a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py
index f9ae4b2c..c6a05195 100644
--- a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py
+++ b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py
@@ -76,8 +76,11 @@ class Body2DKeypointsPipeline(Pipeline):
             }
 
         poses, scores, boxes = self.keypoint_model.postprocess(input)
+        result_boxes = []
+        for box in boxes:
+            result_boxes.append([box[0][0], box[0][1], box[1][0], box[1][1]])
         return {
-            OutputKeys.BOXES: boxes,
+            OutputKeys.BOXES: result_boxes,
             OutputKeys.POSES: poses,
             OutputKeys.SCORES: scores
         }
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index 6175a53f..9ec2c4f3 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -66,8 +66,8 @@ def draw_joints(image, np_kps, score, threshold=0.2):
 
 
 def draw_box(image, box):
-    cv2.rectangle(image, (int(box[0][0]), int(box[0][1])),
-                  (int(box[1][0]), int(box[1][1])), (0, 0, 255), 2)
+    cv2.rectangle(image, (int(box[0]), int(box[1])),
+                  (int(box[2]), int(box[3])), (0, 0, 255), 2)
 
 
 def realtime_object_detection_bbox_vis(image, bboxes):

From a12844c89f63a29d4b26ec7cfe27cc66af60e799 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Mon, 19 Sep 2022 10:06:47 +0800
Subject: [PATCH 105/175] [to #44902165] bump version to 0.4.1

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index abeeedbf..f0ede3d3 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.4.0'
+__version__ = '0.4.1'

From 0548d92de86b7094793a3a9e1aa862db84b8324e Mon Sep 17 00:00:00 2001
From: "lingcai.wl" <lingcai.wl@alibaba-inc.com>
Date: Mon, 19 Sep 2022 10:27:26 +0800
Subject: [PATCH 106/175] [to #44657982]  fix some demo problems         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10083156

---
 modelscope/models/multi_modal/ofa_for_all_tasks.py   | 4 ++--
 modelscope/pipelines/audio/linear_aec_pipeline.py    | 2 +-
 modelscope/preprocessors/multi_modal.py              | 3 ++-
 modelscope/utils/demo_utils.py                       | 2 +-
 tests/pipelines/test_automatic_speech_recognition.py | 3 +++
 5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py
index 860b68d3..05950378 100644
--- a/modelscope/models/multi_modal/ofa_for_all_tasks.py
+++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py
@@ -152,8 +152,8 @@ class OfaForAllTasks(TorchModel):
         region_tensor[:, ::2] /= input['w_resize_ratios']
         region_tensor[:, 1::2] /= input['h_resize_ratios']
         return {
-            OutputKeys.BOXES: move_to_device(region_tensor,
-                                             torch.device('cpu')),
+            OutputKeys.BOXES:
+            move_to_device(region_tensor, torch.device('cpu')).tolist(),
             OutputKeys.SCORES: [1.0] * region_tensor.shape[0]
         }
 
diff --git a/modelscope/pipelines/audio/linear_aec_pipeline.py b/modelscope/pipelines/audio/linear_aec_pipeline.py
index b59bc475..0e73b697 100644
--- a/modelscope/pipelines/audio/linear_aec_pipeline.py
+++ b/modelscope/pipelines/audio/linear_aec_pipeline.py
@@ -51,7 +51,7 @@ class LinearAECPipeline(Pipeline):
 
     When invoke the class with pipeline.__call__(), you should provide two params:
         Dict[str, Any]
-            the path of wav files，eg:{
+            the path of wav files, eg:{
             "nearend_mic": "/your/data/near_end_mic_audio.wav",
             "farend_speech": "/your/data/far_end_speech_audio.wav"}
         output_path (str, optional): "/your/output/audio_after_aec.wav"
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 9873a62c..342ba6b5 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -8,6 +8,7 @@ from PIL import Image
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Preprocessors
 from modelscope.pipelines.base import Input
+from modelscope.preprocessors import load_image
 from modelscope.utils.config import Config
 from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks
 from .base import Preprocessor
@@ -137,7 +138,7 @@ class MPlugPreprocessor(Preprocessor):
     def image_open(self, path: str) -> Tuple[Image.Image, int]:
         if path not in self._image_map:
             index = len(self._image_map)
-            self._image_map[path] = (Image.open(path), index)
+            self._image_map[path] = (load_image(path), index)
         return self._image_map[path]
 
     def __call__(
diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py
index 0f8378cd..93535c1e 100644
--- a/modelscope/utils/demo_utils.py
+++ b/modelscope/utils/demo_utils.py
@@ -236,7 +236,7 @@ def postprocess(req, resp):
             _, img_encode = cv2.imencode('.' + file_type, content)
             img_bytes = img_encode.tobytes()
             return type(img_bytes)
-        elif file_type == 'wav':
+        else:
             out_mem_file = io.BytesIO()
             out_mem_file.write(new_resp.get(output_key))
             return type(out_mem_file)
diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py
index e475c3cd..303fb6b9 100644
--- a/tests/pipelines/test_automatic_speech_recognition.py
+++ b/tests/pipelines/test_automatic_speech_recognition.py
@@ -22,6 +22,9 @@ URL_FILE = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audi
 LITTLE_TESTSETS_FILE = 'data_aishell.tar.gz'
 LITTLE_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/data_aishell.tar.gz'
 
+TFRECORD_TESTSETS_FILE = 'tfrecord.tar.gz'
+TFRECORD_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/tfrecord.tar.gz'
+
 
 class AutomaticSpeechRecognitionTest(unittest.TestCase,
                                      DemoCompatibilityCheck):

From 0041ab0ab8928a362b3bcc293fd6289dd618d29a Mon Sep 17 00:00:00 2001
From: myf272609 <myf272609@alibaba-inc.com>
Date: Mon, 19 Sep 2022 11:28:01 +0800
Subject: [PATCH 107/175] [to #42322933] add multi-style cartoon models to ut
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 卡通化接入多风格模型（原始日漫风、3D、手绘风、素描风、艺术特效风格），添加ut接入测试
2. 修改pipeline中模型文件名称至通用名
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10153717
---
 .../pipelines/cv/image_cartoon_pipeline.py    |  6 ++--
 tests/pipelines/test_person_image_cartoon.py  | 28 +++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py
index eb669354..f34be618 100644
--- a/modelscope/pipelines/cv/image_cartoon_pipeline.py
+++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py
@@ -40,11 +40,9 @@ class ImageCartoonPipeline(Pipeline):
         with device_placement(self.framework, self.device_name):
             self.facer = FaceAna(self.model)
             self.sess_anime_head = self.load_sess(
-                os.path.join(self.model, 'cartoon_anime_h.pb'),
-                'model_anime_head')
+                os.path.join(self.model, 'cartoon_h.pb'), 'model_anime_head')
             self.sess_anime_bg = self.load_sess(
-                os.path.join(self.model, 'cartoon_anime_bg.pb'),
-                'model_anime_bg')
+                os.path.join(self.model, 'cartoon_bg.pb'), 'model_anime_bg')
 
         self.box_width = 288
         global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg'))
diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py
index 5c81cd28..b8549f4f 100644
--- a/tests/pipelines/test_person_image_cartoon.py
+++ b/tests/pipelines/test_person_image_cartoon.py
@@ -16,6 +16,10 @@ class ImageCartoonTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_unet_person-image-cartoon_compound-models'
+        self.model_id_3d = 'damo/cv_unet_person-image-cartoon-3d_compound-models'
+        self.model_id_handdrawn = 'damo/cv_unet_person-image-cartoon-handdrawn_compound-models'
+        self.model_id_sketch = 'damo/cv_unet_person-image-cartoon-sketch_compound-models'
+        self.model_id_artstyle = 'damo/cv_unet_person-image-cartoon-artstyle_compound-models'
         self.task = Tasks.image_portrait_stylization
         self.test_image = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png'
 
@@ -31,6 +35,30 @@ class ImageCartoonTest(unittest.TestCase, DemoCompatibilityCheck):
             Tasks.image_portrait_stylization, model=self.model_id)
         self.pipeline_inference(img_cartoon, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_3d(self):
+        # Build the stylization pipeline with the 3D-style model id, then run the inference check.
+        cartoon_3d = pipeline(Tasks.image_portrait_stylization, model=self.model_id_3d)
+        self.pipeline_inference(cartoon_3d, self.test_image)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_handdrawn(self):
+        # Build the stylization pipeline with the hand-drawn-style model id, then run the inference check.
+        cartoon_handdrawn = pipeline(Tasks.image_portrait_stylization, model=self.model_id_handdrawn)
+        self.pipeline_inference(cartoon_handdrawn, self.test_image)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_sketch(self):
+        # Build the stylization pipeline with the sketch-style model id, then run the inference check.
+        cartoon_sketch = pipeline(Tasks.image_portrait_stylization, model=self.model_id_sketch)
+        self.pipeline_inference(cartoon_sketch, self.test_image)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_artstyle(self):
+        # Build the stylization pipeline with the art-style model id, then run the inference check.
+        cartoon_artstyle = pipeline(Tasks.image_portrait_stylization, model=self.model_id_artstyle)
+        self.pipeline_inference(cartoon_artstyle, self.test_image)
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         img_cartoon = pipeline(Tasks.image_portrait_stylization)

From 4cdd0c23eb589d05ad9c53ffda33865b1e3bbb0b Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Mon, 19 Sep 2022 17:05:35 +0800
Subject: [PATCH 108/175] [to #42322933] Refactor and fix some bugs

1. Fix a bug in trainer's progress bar
2. Fix a bug that trainer does not support dataset in config file
3. Add feature: support resuming training from a checkpoint file
4. Add feature: support fixed filename when saving best checkpoint
5. Fix a bug that no id2label in config file after finetune of nlp models
6. Fix some other bugs
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10138906
---
 .../metrics/sequence_classification_metric.py |   4 +-
 modelscope/trainers/hooks/checkpoint_hook.py  | 143 ++++++++++++++++--
 modelscope/trainers/hooks/hook.py             |   6 +
 modelscope/trainers/hooks/optimizer/base.py   |   3 +-
 .../trainers/lrscheduler/warmup/base.py       |   4 +-
 modelscope/trainers/nlp_trainer.py            |  64 +++++---
 modelscope/trainers/trainer.py                |  43 ++++--
 modelscope/utils/checkpoint.py                |  75 +++++++--
 modelscope/utils/regress_test_utils.py        |  21 ++-
 modelscope/utils/tensor_utils.py              |   3 -
 .../data/test/regression/sbert-base-tnews.bin |   3 -
 tests/trainers/test_trainer_with_nlp.py       |  87 ++++++++++-
 12 files changed, 374 insertions(+), 82 deletions(-)
 delete mode 100644 tests/trainers/data/test/regression/sbert-base-tnews.bin

diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py
index 83cb39ca..d795d8a2 100644
--- a/modelscope/metrics/sequence_classification_metric.py
+++ b/modelscope/metrics/sequence_classification_metric.py
@@ -14,9 +14,9 @@ from .builder import METRICS, MetricKeys
 @METRICS.register_module(
     group_key=default_group, module_name=Metrics.seq_cls_metric)
 class SequenceClassificationMetric(Metric):
-    """The metric computation class for sequence classification classes.
+    """The metric computation class for sequence classification tasks.
 
-    This metric class calculates accuracy for the whole input batches.
+    This metric class calculates accuracy of the whole input batches.
     """
 
     def __init__(self, *args, **kwargs):
diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index fcd8e982..a9b793d4 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -1,14 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+import random
 
-import json
+import numpy as np
+import torch
 
 from modelscope import __version__
 from modelscope.metainfo import Hooks
-from modelscope.utils.checkpoint import save_checkpoint
+from modelscope.utils.checkpoint import load_checkpoint, save_checkpoint
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.logger import get_logger
-from modelscope.utils.torch_utils import is_master
+from modelscope.utils.torch_utils import get_dist_info, is_master
 from .builder import HOOKS
 from .hook import Hook
 from .priority import Priority
@@ -25,6 +27,7 @@ class CheckpointHook(Hook):
         save_optimizer (bool): Whether to save optimizer state dict.  Default: True.
         save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir`
         save_last (bool): Whether to save the last checkpoint. Default: True.
+        checkpoint_file (str): The checkpoint file to be loaded.
     """
 
     PRIORITY = Priority.LOW
@@ -34,12 +37,16 @@ class CheckpointHook(Hook):
                  by_epoch=True,
                  save_optimizer=True,
                  save_dir=None,
-                 save_last=True):
+                 save_last=True,
+                 checkpoint_file=None):
         self.interval = interval
         self.by_epoch = by_epoch
         self.save_optimizer = save_optimizer
         self.save_dir = save_dir
+        self.checkpoint_file = checkpoint_file
         self.save_last = save_last
+        self.rng_state = None
+        self.need_load_rng_state = False
 
     def before_run(self, trainer):
         if not self.save_dir:
@@ -56,6 +63,34 @@ class CheckpointHook(Hook):
         if is_master():
             self.logger.info(f'Checkpoints will be saved to {self.save_dir}')
 
+        if self.checkpoint_file is not None and os.path.isfile(
+                self.checkpoint_file):
+            meta = self.load_checkpoint(self.checkpoint_file, trainer)
+            self.rng_state = meta.get('rng_state')
+            self.need_load_rng_state = True
+
+    def before_train_epoch(self, trainer):
+        """Restore the checkpoint's RNG state once (if any), then snapshot the current RNG state."""
+        if self.need_load_rng_state:
+            if self.rng_state is not None:
+                random.setstate(self.rng_state['random'])
+                np.random.set_state(self.rng_state['numpy'])
+                torch.random.set_rng_state(self.rng_state['cpu'])
+                if torch.cuda.is_available():
+                    torch.cuda.random.set_rng_state_all(self.rng_state['cuda'])
+            else:
+                self.logger.warn('Random state cannot be found in checkpoint file, '
+                                 'this may cause a random data order or model initialization.')
+            # One-shot in both branches: otherwise the next epoch would rewind to the snapshot taken below.
+            self.need_load_rng_state = False
+
+        self.rng_state = {
+            'random': random.getstate(),
+            'numpy': np.random.get_state(),
+            'cpu': torch.random.get_rng_state(),
+            'cuda': torch.cuda.get_rng_state_all(),
+        }
+
     def after_train_epoch(self, trainer):
         if not self.by_epoch:
             return
@@ -66,6 +101,39 @@ class CheckpointHook(Hook):
                     f'Saving checkpoint at {trainer.epoch + 1} epoch')
                 self._save_checkpoint(trainer)
 
+    @classmethod
+    def load_checkpoint(cls, filename, trainer):
+        """Restore model/optimizer/lr_scheduler, trainer progress and hook states from `filename`.
+
+        Returns the checkpoint meta dict (empty when the file carries no meta).
+        """
+        from modelscope.trainers.parallel.utils import is_parallel
+        model = trainer.model.module if is_parallel(trainer.model) else trainer.model
+        # `or {}` keeps the `.get` calls below safe when the loader yields no meta.
+        meta = load_checkpoint(filename, model, trainer.optimizer, trainer.lr_scheduler) or {}
+        trainer._epoch = meta.get('epoch', trainer._epoch)
+        trainer._iter = meta.get('iter', trainer._iter)
+        trainer._inner_iter = meta.get('inner_iter', trainer._inner_iter)
+
+        for i, hook in enumerate(trainer.hooks):
+            # Hook states are keyed by hook class and registration index.
+            key = f'{hook.__class__}-{i}'
+            if key in meta:
+                hook.load_state_dict(meta[key])
+            else:
+                trainer.logger.warn(
+                    f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.'
+                )
+
+        version = meta.get('modelscope')
+        if version != __version__:
+            trainer.logger.warn(
+                f'The modelscope version of loaded checkpoint does not match the runtime version. '
+                f'The saved version: {version}, runtime version: {__version__}'
+            )
+        trainer.logger.info(f'Checkpoint {filename} saving time: {meta.get("time")}')
+        return meta
+
     def _save_checkpoint(self, trainer):
         if self.by_epoch:
             cur_save_name = os.path.join(
@@ -74,7 +142,21 @@ class CheckpointHook(Hook):
             cur_save_name = os.path.join(
                 self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth')
 
-        save_checkpoint(trainer.model, cur_save_name, trainer.optimizer)
+        meta = {
+            'epoch': trainer.epoch,
+            'iter': trainer.iter + 1,
+            'inner_iter': trainer.inner_iter + 1,
+            'rng_state': self.rng_state,
+        }
+        for i, hook in enumerate(trainer.hooks):
+            meta[f'{hook.__class__}-{i}'] = hook.state_dict()
+
+        save_checkpoint(
+            trainer.model,
+            cur_save_name,
+            trainer.optimizer,
+            trainer.lr_scheduler,
+            meta=meta)
         if (self.is_last_epoch(trainer)
                 and self.by_epoch) or (self.is_last_iter(trainer)
                                        and not self.by_epoch):
@@ -144,6 +226,7 @@ class BestCkptSaverHook(CheckpointHook):
                  by_epoch=True,
                  save_optimizer=True,
                  save_dir=None,
+                 save_file_name=None,
                  interval=0):
         assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.'
         super().__init__(
@@ -179,16 +262,44 @@ class BestCkptSaverHook(CheckpointHook):
         return False
 
     def _save_checkpoint(self, trainer):
-        if self.by_epoch:
-            cur_save_name = os.path.join(
-                self.save_dir,
-                f'best_{LogKeys.EPOCH}{trainer.epoch + 1}_{self.metric_key}{self._best_metric}.pth'
-            )
-        else:
-            cur_save_name = os.path.join(
-                self.save_dir,
-                f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.metric_key}{self._best_metric}.pth'
-            )
-        save_checkpoint(trainer.model, cur_save_name, trainer.optimizer)
+        cur_save_name = self.save_file_name
+        if cur_save_name is None:
+            if self.by_epoch:
+                cur_save_name = os.path.join(
+                    self.save_dir,
+                    f'best_{LogKeys.EPOCH}{trainer.epoch + 1}_{self.metric_key}{self._best_metric}.pth'
+                )
+            else:
+                cur_save_name = os.path.join(
+                    self.save_dir,
+                    f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.metric_key}{self._best_metric}.pth'
+                )
+
+        meta = {
+            'epoch': trainer.epoch,
+            'iter': trainer.iter + 1,
+            'inner_iter': trainer.inner_iter + 1,
+            'rng_state': self.rng_state,
+        }
+        for i, hook in enumerate(trainer.hooks):
+            meta[f'{hook.__class__}-{i}'] = hook.state_dict()
+
+        if os.path.isfile(cur_save_name):
+            os.remove(cur_save_name)
+        save_checkpoint(trainer.model, cur_save_name, trainer.optimizer,
+                        trainer.lr_scheduler, meta)
         self._best_ckpt_file = cur_save_name
         self._save_pretrained(trainer)
+
+    def state_dict(self):
+        """Return this hook's persistable state: the best metric value recorded so far."""
+        best_metric = self._best_metric
+        return {'best_metric': best_metric}
+
+    def load_state_dict(self, state_dict):
+        """Restore the best metric value from `state_dict`; warn when it is None or empty."""
+        if state_dict is None or len(state_dict) == 0:
+            self.logger.warn(
+                'The state_dict is not available, the best metric value will be affected.')
+            return
+        self._best_metric = state_dict.get('best_metric')
diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py
index 1c567f1c..d3805be8 100644
--- a/modelscope/trainers/hooks/hook.py
+++ b/modelscope/trainers/hooks/hook.py
@@ -215,3 +215,9 @@ class Hook:
                 trigger_stages.add(stage)
 
         return [stage for stage in Hook.stages if stage in trigger_stages]
+
+    def state_dict(self):
+        return {}  # the base hook has no state to persist
+
+    def load_state_dict(self, state_dict):
+        pass  # the base hook has nothing to restore
diff --git a/modelscope/trainers/hooks/optimizer/base.py b/modelscope/trainers/hooks/optimizer/base.py
index dffad6ea..8c61dfdb 100644
--- a/modelscope/trainers/hooks/optimizer/base.py
+++ b/modelscope/trainers/hooks/optimizer/base.py
@@ -4,6 +4,7 @@ import logging
 from torch.nn.utils import clip_grad
 
 from modelscope.metainfo import Hooks
+from modelscope.outputs import OutputKeys
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.trainers.hooks.hook import Hook
 from modelscope.trainers.hooks.priority import Priority
@@ -27,7 +28,7 @@ class OptimizerHook(Hook):
     def __init__(self,
                  cumulative_iters=1,
                  grad_clip=None,
-                 loss_keys='loss') -> None:
+                 loss_keys=OutputKeys.LOSS) -> None:
         if isinstance(loss_keys, str):
             loss_keys = [loss_keys]
         assert isinstance(loss_keys, (tuple, list))
diff --git a/modelscope/trainers/lrscheduler/warmup/base.py b/modelscope/trainers/lrscheduler/warmup/base.py
index 81497817..4b066281 100644
--- a/modelscope/trainers/lrscheduler/warmup/base.py
+++ b/modelscope/trainers/lrscheduler/warmup/base.py
@@ -28,10 +28,10 @@ class BaseWarmup(_LRScheduler):
         return self.base_scheduler.get_lr()
 
     def state_dict(self):
-        self.base_scheduler.state_dict()
+        return self.base_scheduler.state_dict()
 
     def load_state_dict(self, state_dict):
-        self.base_scheduler.load_state_dict(state_dict)
+        return self.base_scheduler.load_state_dict(state_dict)
 
     def scale(self):
         """Scale the learning rates.
diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py
index 3692b486..4a14be31 100644
--- a/modelscope/trainers/nlp_trainer.py
+++ b/modelscope/trainers/nlp_trainer.py
@@ -1,6 +1,7 @@
 import os
-from typing import Callable, Dict, Optional, Tuple, Union
+from typing import Callable, Optional, Tuple, Union
 
+import numpy as np
 import torch
 from torch import nn
 from torch.utils.data import Dataset
@@ -11,9 +12,10 @@ from modelscope.metrics.builder import build_metric
 from modelscope.models.base import Model, TorchModel
 from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import Preprocessor, build_preprocessor
-from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.config import Config
 from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys,
                                        ModelFile, Tasks)
+from modelscope.utils.hub import parse_label_mapping
 from .base import TRAINERS
 from .trainer import EpochBasedTrainer
 
@@ -81,19 +83,32 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
             assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class'
             model_dir = os.path.dirname(cfg_file)
 
+        self.label2id = None
+        self.id2label = None
+        self.num_labels = None
         self.cfg_modify_fn = cfg_modify_fn
         self.cfg = self.rebuild_config(Config.from_file(cfg_file))
-        try:
-            labels = self.cfg.dataset.train.labels
-        except AttributeError:
-            labels = None
 
-        self.label2id = None
-        self.num_labels = None
-        if labels is not None and len(labels) > 0:
-            self.label2id = {label: idx for idx, label in enumerate(labels)}
-            self.id2label = {idx: label for idx, label in enumerate(labels)}
-            self.num_labels = len(labels)
+        label2id = parse_label_mapping(model_dir)
+        if label2id is not None:
+            self.label2id = label2id
+            self.id2label = {id: label for label, id in label2id.items()}
+            self.num_labels = len(label2id)
+        else:
+            try:
+                labels = self.cfg.dataset.train.labels
+                if labels is not None and len(labels) > 0:
+                    self.label2id = {
+                        label: idx
+                        for idx, label in enumerate(labels)
+                    }
+                    self.id2label = {
+                        idx: label
+                        for idx, label in enumerate(labels)
+                    }
+                    self.num_labels = len(labels)
+            except AttributeError:
+                pass
 
         def build_dataset_keys(cfg):
             if cfg is not None:
@@ -130,7 +145,13 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
 
     def rebuild_config(self, cfg: Config):
         if self.cfg_modify_fn is not None:
-            return self.cfg_modify_fn(cfg)
+            cfg = self.cfg_modify_fn(cfg)
+        if not hasattr(cfg.model, 'label2id') and not hasattr(
+                cfg.model, 'id2label'):
+            if self.id2label is not None:
+                cfg.model['id2label'] = self.id2label
+            if self.label2id is not None:
+                cfg.model['label2id'] = self.label2id
         return cfg
 
     def build_model(self) -> Union[nn.Module, TorchModel]:
@@ -203,6 +224,9 @@ class VecoTrainer(NlpEpochBasedTrainer):
 
         """
         from modelscope.msdatasets.task_datasets import VecoDataset
+        if checkpoint_path is not None and os.path.isfile(checkpoint_path):
+            from modelscope.trainers.hooks import CheckpointHook
+            CheckpointHook.load_checkpoint(checkpoint_path, self)
         self.model.eval()
         self._mode = ModeKeys.EVAL
         metric_values = {}
@@ -223,12 +247,10 @@ class VecoTrainer(NlpEpochBasedTrainer):
                 self.eval_dataset, **self.cfg.evaluation.get('dataloader', {}))
             self.data_loader = self.eval_dataloader
 
-            metric_classes = [
-                build_metric(metric, default_args={'trainer': self})
-                for metric in self.metrics
-            ]
-            self.evaluation_loop(self.eval_dataloader, checkpoint_path,
-                                 metric_classes)
+            metric_classes = [build_metric(metric) for metric in self.metrics]
+            for m in metric_classes:
+                m.trainer = self
+            self.evaluation_loop(self.eval_dataloader, metric_classes)
 
             for m_idx, metric_cls in enumerate(metric_classes):
                 if f'eval_dataset[{idx}]' not in metric_values:
@@ -242,4 +264,8 @@ class VecoTrainer(NlpEpochBasedTrainer):
             else:
                 break
 
+        for metric_name in self.metrics:
+            metric_values[metric_name] = np.average(
+                [m[metric_name] for m in metric_values.values()])
+
         return metric_values
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index d771d9d6..69645d07 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
-import random
 import time
 from collections.abc import Mapping
 from distutils.version import LooseVersion
@@ -8,7 +7,6 @@ from functools import partial
 from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import json
-import numpy as np
 import torch
 from torch import distributed as dist
 from torch import nn
@@ -425,8 +423,16 @@ class EpochBasedTrainer(BaseTrainer):
             metrics = [metrics]
         return metrics
 
-    def train(self, *args, **kwargs):
-        self.model.train()
+    def set_checkpoint_file_to_hook(self, checkpoint_path):
+        """Point every registered CheckpointHook at `checkpoint_path` when it is an existing file."""
+        if checkpoint_path is None or not os.path.isfile(checkpoint_path):
+            return
+        from modelscope.trainers.hooks import CheckpointHook
+        # Collect the matching hooks first, then assign the path to each of them.
+        for ckpt_hook in [h for h in self.hooks if isinstance(h, CheckpointHook)]:
+            ckpt_hook.checkpoint_file = checkpoint_path
+
+    def train(self, checkpoint_path=None, *args, **kwargs):
         self._mode = ModeKeys.TRAIN
 
         if self.train_dataset is None:
@@ -442,13 +448,17 @@ class EpochBasedTrainer(BaseTrainer):
 
         self.register_optimizers_hook()
         self.register_hook_from_cfg(self.cfg.train.hooks)
+        self.set_checkpoint_file_to_hook(checkpoint_path)
+        self.model.train()
 
         self.train_loop(self.train_dataloader)
 
     def evaluate(self, checkpoint_path=None):
+        if checkpoint_path is not None and os.path.isfile(checkpoint_path):
+            from modelscope.trainers.hooks import CheckpointHook
+            CheckpointHook.load_checkpoint(checkpoint_path, self)
         self.model.eval()
         self._mode = ModeKeys.EVAL
-
         if self.eval_dataset is None:
             self.eval_dataloader = self.get_eval_data_loader()
         else:
@@ -462,8 +472,9 @@ class EpochBasedTrainer(BaseTrainer):
         metric_classes = [build_metric(metric) for metric in self.metrics]
         for m in metric_classes:
             m.trainer = self
+
         metric_values = self.evaluation_loop(self.eval_dataloader,
-                                             checkpoint_path, metric_classes)
+                                             metric_classes)
 
         self._metric_values = metric_values
         return metric_values
@@ -631,18 +642,13 @@ class EpochBasedTrainer(BaseTrainer):
         if hasattr(data_cfg, 'name'):
             dataset = MsDataset.load(
                 dataset_name=data_cfg.name,
-                split=data_cfg.split,
-                subset_name=data_cfg.subset_name if hasattr(
-                    data_cfg, 'subset_name') else None,
-                hub=data_cfg.hub
-                if hasattr(data_cfg, 'hub') else Hubs.modelscope,
                 **data_cfg,
             )
             cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
             torch_dataset = dataset.to_torch_dataset(
                 task_data_config=cfg,
                 task_name=self.cfg.task,
-                preprocessors=self.preprocessor)
+                preprocessors=preprocessor)
         else:
             torch_dataset = build_task_dataset(data_cfg, self.cfg.task)
         dataset = self.to_task_dataset(torch_dataset, mode)
@@ -802,19 +808,22 @@ class EpochBasedTrainer(BaseTrainer):
         """ Training loop used by `EpochBasedTrainer.train()`
         """
         self.invoke_hook(TrainerStages.before_run)
-        self._epoch = 0
         kwargs = {}
         self.model.train()
         for _ in range(self._epoch, self._max_epochs):
             self.invoke_hook(TrainerStages.before_train_epoch)
             time.sleep(2)  # Prevent possible deadlock during epoch transition
             for i, data_batch in enumerate(data_loader):
+                if i < self.inner_iter:
+                    # inner_iter may be read out from the checkpoint file, so skip the trained iters in the epoch.
+                    continue
                 data_batch = to_device(data_batch, self.device)
                 self.data_batch = data_batch
                 self._inner_iter = i
                 self.invoke_hook(TrainerStages.before_train_iter)
                 self.train_step(self.model, data_batch, **kwargs)
                 self.invoke_hook(TrainerStages.after_train_iter)
+                # Value changed after the hooks are invoked, do not move them above the invoke_hook code.
                 del self.data_batch
                 self._iter += 1
                 self._mode = ModeKeys.TRAIN
@@ -823,12 +832,14 @@ class EpochBasedTrainer(BaseTrainer):
                     break
 
             self.invoke_hook(TrainerStages.after_train_epoch)
+            # Value changed after the hooks are invoked, do not move them above the invoke_hook code.
+            self._inner_iter = 0
             self._epoch += 1
 
         time.sleep(1)  # wait for some hooks like loggers to finish
         self.invoke_hook(TrainerStages.after_run)
 
-    def evaluation_loop(self, data_loader, checkpoint_path, metric_classes):
+    def evaluation_loop(self, data_loader, metric_classes):
         """ Evaluation loop used by `EpochBasedTrainer.evaluate()`.
 
         """
@@ -841,7 +852,7 @@ class EpochBasedTrainer(BaseTrainer):
                 tmpdir=None,
                 gpu_collect=False,
                 metric_classes=metric_classes,
-                data_loader_iters_per_gpu=self.iters_per_epoch)
+                data_loader_iters_per_gpu=self._eval_iters_per_epoch)
         else:
             from modelscope.trainers.utils.inference import single_gpu_test
             metric_values = single_gpu_test(
@@ -849,7 +860,7 @@ class EpochBasedTrainer(BaseTrainer):
                 data_loader,
                 device=self.device,
                 metric_classes=metric_classes,
-                data_loader_iters=self.iters_per_epoch)
+                data_loader_iters=self._eval_iters_per_epoch)
 
         self._inner_iter = self.iters_per_epoch - 1  # start from index 0
 
diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py
index 425d3312..8d8c2b2f 100644
--- a/modelscope/utils/checkpoint.py
+++ b/modelscope/utils/checkpoint.py
@@ -8,14 +8,17 @@ from shutil import copytree, ignore_patterns, rmtree
 from typing import Callable, List, Optional, Union
 
 import json
-import numpy as np
 import torch
 from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
 
 from modelscope import __version__
 from modelscope.fileio import File, LocalStorage
 from modelscope.utils.config import JSONIteratorEncoder
 from modelscope.utils.constant import ConfigFields, ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 storage = LocalStorage()
 
@@ -40,24 +43,27 @@ def weights_to_cpu(state_dict):
 def save_checkpoint(model: torch.nn.Module,
                     filename: str,
                     optimizer: Optional[Optimizer] = None,
+                    lr_scheduler: Optional[_LRScheduler] = None,
                     meta: Optional[dict] = None,
                     with_meta: bool = True) -> None:
     """Save checkpoint to file.
 
     The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
-    ``optimizer``. By default ``meta`` will contain version and time info.
+    ``optimizer``. By default, ``meta`` will contain version and time info.
 
     Args:
         model (Module): Module whose params are to be saved.
         filename (str): Checkpoint filename.
         optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
+        lr_scheduler(:obj:`_LRScheduler`, optional): LRScheduler to be saved.
         meta (dict, optional): Metadata to be saved in checkpoint.
+        with_meta (bool, optional):
     """
     if meta is None:
         meta = {}
     elif not isinstance(meta, dict):
         raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
-    meta.update(modescope=__version__, time=time.asctime())
+    meta.update(modelscope=__version__, time=time.asctime())
 
     if isinstance(model, torch.nn.parallel.DistributedDataParallel):
         model = model.module
@@ -71,22 +77,69 @@ def save_checkpoint(model: torch.nn.Module,
             'meta': meta,
             'state_dict': weights_to_cpu(model.state_dict())
         }
+
+        # save optimizer state dict in the checkpoint
+        if isinstance(optimizer, Optimizer):
+            checkpoint['optimizer'] = optimizer.state_dict()
+        elif isinstance(optimizer, dict):
+            checkpoint['optimizer'] = {}
+            for name, optim in optimizer.items():
+                checkpoint['optimizer'][name] = optim.state_dict()
+
+        # save lr_scheduler state dict in the checkpoint
+        assert isinstance(lr_scheduler, _LRScheduler), \
+            f'lr_scheduler to be saved should be a subclass of _LRScheduler, current is : {lr_scheduler.__class__}'
+        checkpoint['lr_scheduler'] = lr_scheduler.state_dict()
     else:
         checkpoint = weights_to_cpu(model.state_dict())
 
-    # save optimizer state dict in the checkpoint
-    if isinstance(optimizer, Optimizer):
-        checkpoint['optimizer'] = optimizer.state_dict()
-    elif isinstance(optimizer, dict):
-        checkpoint['optimizer'] = {}
-        for name, optim in optimizer.items():
-            checkpoint['optimizer'][name] = optim.state_dict()
-
     with io.BytesIO() as f:
         torch.save(checkpoint, f)
         File.write(f.getvalue(), filename)
 
 
+def load_checkpoint(filename,
+                    model,
+                    optimizer: Optimizer = None,
+                    lr_scheduler: _LRScheduler = None):
+    """Load a checkpoint into model/optimizer/lr_scheduler and return its meta dict ({} if absent)."""
+    if not os.path.exists(filename):
+        raise ValueError(f'Checkpoint file {filename} does not exist!')
+    checkpoint = torch.load(filename, map_location='cpu')
+
+    if optimizer is not None:
+        if 'optimizer' in checkpoint:
+            if isinstance(optimizer, Optimizer):
+                optimizer.load_state_dict(checkpoint['optimizer'])
+            elif isinstance(optimizer, dict):
+                optimizer_dict = checkpoint['optimizer']
+                for key, optimizer_ins in optimizer.items():
+                    if key in optimizer_dict:
+                        optimizer_ins.load_state_dict(optimizer_dict[key])
+                    else:
+                        logger.warn(
+                            f'The state dict of optimizer {key} cannot be found in checkpoint file: {filename}'
+                        )
+        else:
+            logger.warn(
+                f'The state dict of optimizer cannot be found in checkpoint file: {filename}'
+            )
+
+    if lr_scheduler is not None:
+        if 'lr_scheduler' in checkpoint:
+            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+        else:
+            logger.warn(
+                f'The state dict of lr_scheduler cannot be found in checkpoint file: {filename}'
+            )
+
+    # A file without a 'state_dict' entry is treated as the bare model state dict.
+    state_dict = checkpoint.get('state_dict', checkpoint)
+    model.load_state_dict(state_dict)
+
+    return checkpoint.get('meta', {})
+
+
 def save_pretrained(model,
                     target_folder: Union[str, os.PathLike],
                     save_checkpoint_name: str = None,
diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py
index 82267447..95d2beea 100644
--- a/modelscope/utils/regress_test_utils.py
+++ b/modelscope/utils/regress_test_utils.py
@@ -299,19 +299,23 @@ class MsRegressTool(RegressTool):
                          file_name,
                          level='config',
                          compare_fn=None,
-                         ignore_keys=None):
+                         ignore_keys=None,
+                         compare_random=True,
+                         lazy_stop_callback=None):
 
-        def lazy_stop_callback():
+        if lazy_stop_callback is None:
 
-            from modelscope.trainers.hooks.hook import Hook, Priority
+            def lazy_stop_callback():
 
-            class EarlyStopHook(Hook):
-                PRIORITY = Priority.VERY_LOW
+                from modelscope.trainers.hooks.hook import Hook, Priority
 
-                def after_iter(self, trainer):
-                    raise MsRegressTool.EarlyStopError('Test finished.')
+                class EarlyStopHook(Hook):
+                    PRIORITY = Priority.VERY_LOW
 
-            trainer.register_hook(EarlyStopHook())
+                    def after_iter(self, trainer):
+                        raise MsRegressTool.EarlyStopError('Test finished.')
+
+                trainer.register_hook(EarlyStopHook())
 
         def _train_loop(trainer, *args, **kwargs):
             with self.monitor_module_train(
@@ -320,6 +324,7 @@ class MsRegressTool(RegressTool):
                     level,
                     compare_fn=compare_fn,
                     ignore_keys=ignore_keys,
+                    compare_random=compare_random,
                     lazy_stop_callback=lazy_stop_callback):
                 try:
                     return trainer.train_loop_origin(*args, **kwargs)
diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py
index 7889d944..b438e476 100644
--- a/modelscope/utils/tensor_utils.py
+++ b/modelscope/utils/tensor_utils.py
@@ -1,8 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Part of the implementation is borrowed from huggingface/transformers.
-from collections.abc import Mapping
-
-import numpy as np
 
 
 def torch_nested_numpify(tensors):
diff --git a/tests/trainers/data/test/regression/sbert-base-tnews.bin b/tests/trainers/data/test/regression/sbert-base-tnews.bin
deleted file mode 100644
index 3a06d49c..00000000
--- a/tests/trainers/data/test/regression/sbert-base-tnews.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2df2a5f3cdfc6dded52d31a8e97d9a9c41a803cb6d46dee709c51872eda37b21
-size 151830
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index 2cf1c152..6030ada9 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -11,7 +11,8 @@ from modelscope.models.nlp.sequence_classification import \
     SbertForSequenceClassification
 from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.trainers import build_trainer
+from modelscope.trainers import EpochBasedTrainer, build_trainer
+from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.hub import read_config
 from modelscope.utils.test_utils import test_level
@@ -119,6 +120,90 @@ class TestTrainerWithNlp(unittest.TestCase):
             checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth'))
         self.assertTrue(Metrics.accuracy in eval_results)
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_with_configured_datasets(self):
+        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+        cfg: Config = read_config(model_id)
+        cfg.train.max_epochs = 20
+        cfg.train.work_dir = self.tmp_dir
+        cfg.dataset = {
+            'train': {
+                'name': 'afqmc_small',
+                'split': 'train',
+                'namespace': 'userxiaoming'
+            },
+            'val': {
+                'name': 'afqmc_small',
+                'split': 'train',
+                'namespace': 'userxiaoming'
+            },
+        }
+        cfg_file = os.path.join(self.tmp_dir, 'config.json')
+        cfg.dump(cfg_file)
+        kwargs = dict(model=model_id, cfg_file=cfg_file)
+
+        trainer = build_trainer(default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(cfg.train.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+        eval_results = trainer.evaluate(
+            checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth'))
+        self.assertTrue(Metrics.accuracy in eval_results)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_continue_train(self):
+        from modelscope.utils.regress_test_utils import MsRegressTool
+        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+        cfg: Config = read_config(model_id)
+        cfg.train.max_epochs = 3
+        cfg.train.work_dir = self.tmp_dir
+        cfg_file = os.path.join(self.tmp_dir, 'config.json')
+        cfg.dump(cfg_file)
+        dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
+        dataset = dataset.to_hf_dataset().select(range(128))
+        kwargs = dict(
+            model=model_id,
+            train_dataset=dataset,
+            eval_dataset=dataset,
+            cfg_file=cfg_file)
+
+        regress_tool = MsRegressTool(baseline=True)
+        trainer: EpochBasedTrainer = build_trainer(default_args=kwargs)
+
+        def lazy_stop_callback():
+            from modelscope.trainers.hooks.hook import Hook, Priority
+
+            class EarlyStopHook(Hook):
+                PRIORITY = Priority.VERY_LOW
+
+                def after_iter(self, trainer):
+                    if trainer.iter == 12:
+                        raise MsRegressTool.EarlyStopError('Test finished.')
+
+            if 'EarlyStopHook' not in [
+                    hook.__class__.__name__ for hook in trainer.hooks
+            ]:
+                trainer.register_hook(EarlyStopHook())
+
+        with regress_tool.monitor_ms_train(
+                trainer,
+                'trainer_continue_train',
+                level='strict',
+                lazy_stop_callback=lazy_stop_callback):
+            trainer.train()
+
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+        trainer = build_trainer(default_args=kwargs)
+        regress_tool = MsRegressTool(baseline=False)
+        with regress_tool.monitor_ms_train(
+                trainer, 'trainer_continue_train', level='strict'):
+            trainer.train(os.path.join(self.tmp_dir, 'iter_12.pth'))
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_trainer_with_model_and_args(self):
         tmp_dir = tempfile.TemporaryDirectory().name

From 4442e68511fc191f1f5710de6f486f5d4f730b24 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Mon, 19 Sep 2022 17:30:16 +0800
Subject: [PATCH 109/175] [to #44902165] bump version to 0.4.2

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index f0ede3d3..a9873473 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.4.1'
+__version__ = '0.4.2'

From ddf8daf0a0c0ee296cea6df704648c176074df0a Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Mon, 19 Sep 2022 20:41:39 +0800
Subject: [PATCH 110/175] [to #42322933] Fix bug in release

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10173975
---
 modelscope/trainers/hooks/checkpoint_hook.py | 12 +++++++-----
 modelscope/utils/checkpoint.py               |  5 ++---
 modelscope/utils/hub.py                      |  4 ++--
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index a9b793d4..220929b8 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -117,20 +117,20 @@ class CheckpointHook(Hook):
         for i, hook in enumerate(trainer.hooks):
             # hook: Hook
             key = f'{hook.__class__}-{i}'
-            if key in meta:
+            if key in meta and hasattr(hook, 'load_state_dict'):
                 hook.load_state_dict(meta[key])
             else:
-                trainer.logger(
+                trainer.logger.warn(
                     f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.'
                 )
 
         version = meta.get('modelscope')
         if version != __version__:
-            trainer.logger(
+            trainer.logger.warn(
                 f'The modelscope version of loaded checkpoint does not match the runtime version. '
                 f'The saved version: {version}, runtime version: {__version__}'
             )
-        trainer.logger(
+        trainer.logger.warn(
             f'Checkpoint {filename} saving time: {meta.get("time")}')
         return meta
 
@@ -149,7 +149,8 @@ class CheckpointHook(Hook):
             'rng_state': self.rng_state,
         }
         for i, hook in enumerate(trainer.hooks):
-            meta[f'{hook.__class__}-{i}'] = hook.state_dict()
+            if hasattr(hook, 'state_dict'):
+                meta[f'{hook.__class__}-{i}'] = hook.state_dict()
 
         save_checkpoint(
             trainer.model,
@@ -239,6 +240,7 @@ class BestCkptSaverHook(CheckpointHook):
         self.rule = rule
         self._best_metric = None
         self._best_ckpt_file = None
+        self.save_file_name = save_file_name
 
     def _should_save(self, trainer):
         return self._is_best_metric(trainer.metric_values)
diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py
index 8d8c2b2f..a9d7f396 100644
--- a/modelscope/utils/checkpoint.py
+++ b/modelscope/utils/checkpoint.py
@@ -87,9 +87,8 @@ def save_checkpoint(model: torch.nn.Module,
                 checkpoint['optimizer'][name] = optim.state_dict()
 
         # save lr_scheduler state dict in the checkpoint
-        assert isinstance(lr_scheduler, _LRScheduler), \
-            f'lr_scheduler to be saved should be a subclass of _LRScheduler, current is : {lr_scheduler.__class__}'
-        checkpoint['lr_scheduler'] = lr_scheduler.state_dict()
+        if lr_scheduler is not None and hasattr(lr_scheduler, 'state_dict'):
+            checkpoint['lr_scheduler'] = lr_scheduler.state_dict()
     else:
         checkpoint = weights_to_cpu(model.state_dict())
 
diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py
index cf114b5e..2dbe7045 100644
--- a/modelscope/utils/hub.py
+++ b/modelscope/utils/hub.py
@@ -142,8 +142,8 @@ def parse_label_mapping(model_dir):
             id2label = config[ConfigFields.preprocessor].id2label
             label2id = {label: id for id, label in id2label.items()}
 
-    if label2id is None:
-        config_path = os.path.join(model_dir, 'config.json')
+    config_path = os.path.join(model_dir, 'config.json')
+    if label2id is None and os.path.exists(config_path):
         config = Config.from_file(config_path)
         if hasattr(config, 'label2id'):
             label2id = config.label2id

From 2c05a349240aa891fc9b6fbe3eb463cdb1443172 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Mon, 19 Sep 2022 21:30:31 +0800
Subject: [PATCH 111/175] [to #42322933] bug fix for fairseq

---
 modelscope/preprocessors/__init__.py          |  4 +-
 modelscope/preprocessors/nlp/__init__.py      | 46 ++++++++++++++++
 .../preprocessors/{nlp.py => nlp/nlp_base.py} | 52 ++-----------------
 .../nlp/text_error_correction.py              | 50 ++++++++++++++++++
 4 files changed, 104 insertions(+), 48 deletions(-)
 create mode 100644 modelscope/preprocessors/nlp/__init__.py
 rename modelscope/preprocessors/{nlp.py => nlp/nlp_base.py} (96%)
 create mode 100644 modelscope/preprocessors/nlp/text_error_correction.py

diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 04901dc5..ba03a35e 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -24,7 +24,8 @@ if TYPE_CHECKING:
         TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
         SequenceLabelingPreprocessor, RelationExtractionPreprocessor,
         DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor,
-        PassageRankingPreprocessor)
+        PassageRankingPreprocessor,
+        WordSegmentationBlankSetToLabelPreprocessor)
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
                         DialogStateTrackingPreprocessor)
@@ -56,6 +57,7 @@ else:
             'TextErrorCorrectionPreprocessor',
             'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
             'RelationExtractionPreprocessor',
+            'WordSegmentationBlankSetToLabelPreprocessor',
             'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
         ],
         'space': [
diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py
new file mode 100644
index 00000000..eee5e80f
--- /dev/null
+++ b/modelscope/preprocessors/nlp/__init__.py
@@ -0,0 +1,46 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .text_error_correction import TextErrorCorrectionPreprocessor
+    from .nlp_base import (
+        Tokenize, SequenceClassificationPreprocessor,
+        TextGenerationPreprocessor, TokenClassificationPreprocessor,
+        SingleSentenceClassificationPreprocessor,
+        PairSentenceClassificationPreprocessor, FillMaskPreprocessor,
+        ZeroShotClassificationPreprocessor, NERPreprocessor,
+        FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor,
+        RelationExtractionPreprocessor, DocumentSegmentationPreprocessor,
+        FillMaskPoNetPreprocessor, PassageRankingPreprocessor,
+        WordSegmentationBlankSetToLabelPreprocessor)
+
+else:
+    _import_structure = {
+        'nlp_base': [
+            'Tokenize', 'SequenceClassificationPreprocessor',
+            'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
+            'SingleSentenceClassificationPreprocessor',
+            'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
+            'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
+            'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
+            'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
+            'RelationExtractionPreprocessor',
+            'WordSegmentationBlankSetToLabelPreprocessor',
+            'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
+        ],
+        'text_error_correction': [
+            'TextErrorCorrectionPreprocessor',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp/nlp_base.py
similarity index 96%
rename from modelscope/preprocessors/nlp.py
rename to modelscope/preprocessors/nlp/nlp_base.py
index e20adaa6..0a2495af 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp/nlp_base.py
@@ -6,20 +6,19 @@ import uuid
 from typing import Any, Dict, Iterable, Optional, Tuple, Union
 
 import numpy as np
-import torch
 from transformers import AutoTokenizer, BertTokenizerFast
 
 from modelscope.metainfo import Models, Preprocessors
 from modelscope.models.nlp.structbert import SbertTokenizerFast
 from modelscope.outputs import OutputKeys
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
 from modelscope.utils.config import Config, ConfigFields
 from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
 from modelscope.utils.hub import get_model_type, parse_label_mapping
 from modelscope.utils.logger import get_logger
 from modelscope.utils.nlp import import_external_nltk_data
 from modelscope.utils.type_assert import type_assert
-from .base import Preprocessor
-from .builder import PREPROCESSORS
 
 logger = get_logger()
 
@@ -30,9 +29,9 @@ __all__ = [
     'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
     'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
     'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
-    'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor',
-    'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor',
-    'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
+    'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
+    'RelationExtractionPreprocessor', 'DocumentSegmentationPreprocessor',
+    'FillMaskPoNetPreprocessor'
 ]
 
 
@@ -889,47 +888,6 @@ class RelationExtractionPreprocessor(Preprocessor):
         }
 
 
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.text_error_correction)
-class TextErrorCorrectionPreprocessor(Preprocessor):
-    """The preprocessor used in text correction task.
-    """
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        from fairseq.data import Dictionary
-        """preprocess the data via the vocab file from the `model_dir` path
-
-        Args:
-            model_dir (str): model path
-        """
-        super().__init__(*args, **kwargs)
-        self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt'))
-
-    def __call__(self, data: str) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (str): a sentence
-                Example:
-                    '随着中国经济突飞猛近，建造工业与日俱增'
-        Returns:
-            Dict[str, Any]: the preprocessed data
-            Example:
-            {'net_input':
-                {'src_tokens':tensor([1,2,3,4]),
-                'src_lengths': tensor([4])}
-            }
-        """
-
-        text = ' '.join([x for x in data])
-        inputs = self.vocab.encode_line(
-            text, append_eos=True, add_if_not_exist=False)
-        lengths = inputs.size()
-        sample = dict()
-        sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths}
-        return sample
-
-
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor)
 class FaqQuestionAnsweringPreprocessor(Preprocessor):
diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py
new file mode 100644
index 00000000..357a946f
--- /dev/null
+++ b/modelscope/preprocessors/nlp/text_error_correction.py
@@ -0,0 +1,50 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+from typing import Any, Dict
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.constant import Fields
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.text_error_correction)
+class TextErrorCorrectionPreprocessor(Preprocessor):
+    """The preprocessor used in text correction task.
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Load the source vocabulary from the `model_dir` path.
+
+        Args:
+            model_dir (str): model path; must contain 'dict.src.txt'
+        """
+        from fairseq.data import Dictionary
+        super().__init__(*args, **kwargs)
+        self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt'))
+
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (str): a sentence
+                Example:
+                    '随着中国经济突飞猛近，建造工业与日俱增'
+        Returns:
+            Dict[str, Any]: the preprocessed data
+            Example:
+            {'net_input':
+                {'src_tokens':tensor([1,2,3,4]),
+                'src_lengths': tensor([4])}
+            }
+        """
+
+        text = ' '.join([x for x in data])
+        inputs = self.vocab.encode_line(
+            text, append_eos=True, add_if_not_exist=False)
+        lengths = inputs.size()
+        sample = dict()
+        sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths}
+        return sample

From 12b8f5d04b799146376380a4293b2241c7d40e6f Mon Sep 17 00:00:00 2001
From: "yuanzhi.zyz" <yuanzhi.zyz@alibaba-inc.com>
Date: Tue, 20 Sep 2022 15:29:57 +0800
Subject: [PATCH 112/175] [to #42322933]fix cpu-used bug

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10184817
---
 modelscope/pipelines/cv/ocr_recognition_pipeline.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modelscope/pipelines/cv/ocr_recognition_pipeline.py b/modelscope/pipelines/cv/ocr_recognition_pipeline.py
index 4b095042..c20d020c 100644
--- a/modelscope/pipelines/cv/ocr_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/ocr_recognition_pipeline.py
@@ -91,7 +91,8 @@ class OCRRecognitionPipeline(Pipeline):
                 data.append(mask)
 
         data = torch.FloatTensor(data).view(
-            len(data), 1, IMG_HEIGHT, IMG_WIDTH).cuda() / 255.
+            len(data), 1, IMG_HEIGHT, IMG_WIDTH) / 255.
+        data = data.to(self.device)
 
         result = {'img': data}
 

From 1eedbd65bcc2c49b9db91a5be5549bdf66092f33 Mon Sep 17 00:00:00 2001
From: "xingguang.zxg" <xingguang.zxg@alibaba-inc.com>
Date: Tue, 20 Sep 2022 15:53:38 +0800
Subject: [PATCH 113/175] =?UTF-8?q?[to=20#42322933]=E4=BF=AE=E5=A4=8Dshop?=
 =?UTF-8?q?=20segmentation=20CPU=20Inference=E9=94=99=E8=AF=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

修复CPU Inference错误，支持CPU inference
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10177721
---
 modelscope/models/cv/shop_segmentation/models.py |  2 +-
 .../cv/shop_segmentation/shop_seg_model.py       | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py
index 8b82d1d1..171aafbd 100644
--- a/modelscope/models/cv/shop_segmentation/models.py
+++ b/modelscope/models/cv/shop_segmentation/models.py
@@ -552,7 +552,7 @@ class CLIPVisionTransformer(nn.Module):
                 nn.GroupNorm(1, embed_dim),
                 nn.ConvTranspose2d(
                     embed_dim, embed_dim, kernel_size=2, stride=2),
-                nn.SyncBatchNorm(embed_dim),
+                nn.BatchNorm2d(embed_dim),
                 nn.GELU(),
                 nn.ConvTranspose2d(
                     embed_dim, embed_dim, kernel_size=2, stride=2),
diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_model.py b/modelscope/models/cv/shop_segmentation/shop_seg_model.py
index 409c583b..0aeeb1de 100644
--- a/modelscope/models/cv/shop_segmentation/shop_seg_model.py
+++ b/modelscope/models/cv/shop_segmentation/shop_seg_model.py
@@ -33,18 +33,18 @@ class ShopSegmentation(TorchModel):
             model_dir=model_dir, device_id=device_id, *args, **kwargs)
 
         self.model = SHOPSEG(model_dir=model_dir)
-        pretrained_params = torch.load('{}/{}'.format(
-            model_dir, ModelFile.TORCH_MODEL_BIN_FILE))
-
+        pretrained_params = torch.load(
+            '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location='cpu')
         self.model.load_state_dict(pretrained_params)
         self.model.eval()
-        self.device_id = device_id
-        if self.device_id >= 0 and torch.cuda.is_available():
-            self.model.to('cuda:{}'.format(self.device_id))
-            logger.info('Use GPU: {}'.format(self.device_id))
+        if device_id >= 0 and torch.cuda.is_available():
+            self.model.to('cuda:{}'.format(device_id))
+            logger.info('Use GPU: {}'.format(device_id))
         else:
-            self.device_id = -1
+            device_id = -1
             logger.info('Use CPU for inference')
+        self.device_id = device_id
 
     def preprocess(self, img, size=1024):
         mean = [0.48145466, 0.4578275, 0.40821073]

From 6808e9a301557073e0c4345654df85ebce8e1698 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Tue, 20 Sep 2022 17:49:31 +0800
Subject: [PATCH 114/175] [to #44902099] add license for framework files

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10189613
---
 modelscope/fileio/__init__.py                                 | 2 ++
 modelscope/fileio/format/__init__.py                          | 2 ++
 modelscope/hub/api.py                                         | 2 ++
 modelscope/hub/constants.py                                   | 2 ++
 modelscope/hub/errors.py                                      | 2 ++
 modelscope/hub/file_download.py                               | 2 ++
 modelscope/hub/git.py                                         | 2 ++
 modelscope/hub/repository.py                                  | 2 ++
 modelscope/hub/snapshot_download.py                           | 2 ++
 modelscope/hub/utils/caching.py                               | 2 ++
 modelscope/hub/utils/utils.py                                 | 2 ++
 modelscope/models/base/__init__.py                            | 2 ++
 modelscope/msdatasets/ms_dataset.py                           | 2 ++
 modelscope/msdatasets/utils/dataset_builder.py                | 2 ++
 modelscope/msdatasets/utils/dataset_utils.py                  | 2 ++
 modelscope/msdatasets/utils/download_utils.py                 | 2 ++
 modelscope/msdatasets/utils/oss_utils.py                      | 2 ++
 modelscope/msdatasets/utils/upload_utils.py                   | 2 ++
 .../multi_modal/generative_multi_modal_embedding_pipeline.py  | 2 ++
 .../pipelines/multi_modal/multi_modal_embedding_pipeline.py   | 2 ++
 .../multi_modal/team_multi_modal_similarity_pipeline.py       | 2 ++
 .../pipelines/multi_modal/text_to_image_synthesis_pipeline.py | 2 ++
 .../multi_modal/video_multi_modal_embedding_pipeline.py       | 2 ++
 modelscope/utils/ast_utils.py                                 | 2 ++
 modelscope/utils/config.py                                    | 4 +++-
 modelscope/utils/config_ds.py                                 | 2 ++
 modelscope/utils/cv/image_utils.py                            | 2 ++
 modelscope/utils/demo_utils.py                                | 2 ++
 modelscope/utils/model_tag.py                                 | 2 ++
 modelscope/utils/regress_test_utils.py                        | 2 ++
 modelscope/utils/type_assert.py                               | 2 ++
 tests/hub/test_hub_examples.py                                | 2 ++
 tests/hub/test_utils.py                                       | 2 ++
 tests/msdatasets/test_ms_dataset.py                           | 2 ++
 tests/pipelines/test_animal_recognition.py                    | 2 ++
 tests/pipelines/test_general_image_classification.py          | 2 ++
 tests/pipelines/test_general_recognition.py                   | 2 ++
 tests/pipelines/test_image_panoptic_segmentation.py           | 2 ++
 tests/pipelines/test_image_semantic_segmentation.py           | 2 ++
 tests/pipelines/test_key_word_spotting_farfield.py            | 2 ++
 tests/pipelines/test_product_retrieval_embedding.py           | 2 ++
 tests/pipelines/test_speech_signal_process.py                 | 2 ++
 tests/pipelines/test_text_to_speech.py                        | 2 ++
 tests/pipelines/test_tinynas_classification.py                | 2 ++
 tests/pipelines/test_tinynas_detection.py                     | 2 ++
 tests/pipelines/test_virtual_try_on.py                        | 2 ++
 tests/trainers/audio/test_ans_trainer.py                      | 2 ++
 tests/trainers/test_dialog_intent_trainer.py                  | 2 ++
 tests/utils/__init__.py                                       | 2 ++
 tests/utils/profiler.py                                       | 2 ++
 50 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/modelscope/fileio/__init__.py b/modelscope/fileio/__init__.py
index b526d593..385cd02c 100644
--- a/modelscope/fileio/__init__.py
+++ b/modelscope/fileio/__init__.py
@@ -1,2 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .file import File, LocalStorage
 from .io import dump, dumps, load
diff --git a/modelscope/fileio/format/__init__.py b/modelscope/fileio/format/__init__.py
index 52e64279..68518266 100644
--- a/modelscope/fileio/format/__init__.py
+++ b/modelscope/fileio/format/__init__.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .base import FormatHandler
 from .json import JsonHandler
 from .yaml import YamlHandler
diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 85da6a31..8dcfa5b0 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import pickle
 import shutil
diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py
index 014a1e59..c8664597 100644
--- a/modelscope/hub/constants.py
+++ b/modelscope/hub/constants.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from pathlib import Path
 
 MODELSCOPE_URL_SCHEME = 'http://'
diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py
index 284dbed4..c095a6ec 100644
--- a/modelscope/hub/errors.py
+++ b/modelscope/hub/errors.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from http import HTTPStatus
 
 from requests.exceptions import HTTPError
diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py
index 5f15272c..1cc5645b 100644
--- a/modelscope/hub/file_download.py
+++ b/modelscope/hub/file_download.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import copy
 import os
 import sys
diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py
index 13e1910d..486f8df3 100644
--- a/modelscope/hub/git.py
+++ b/modelscope/hub/git.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import subprocess
 from typing import List
diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py
index 8d5fd30b..d92089ed 100644
--- a/modelscope/hub/repository.py
+++ b/modelscope/hub/repository.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Optional
 
diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py
index c63d8956..cde6ad34 100644
--- a/modelscope/hub/snapshot_download.py
+++ b/modelscope/hub/snapshot_download.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import tempfile
 from pathlib import Path
diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py
index fc30fa27..1acd2e84 100644
--- a/modelscope/hub/utils/caching.py
+++ b/modelscope/hub/utils/caching.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import hashlib
 import os
 import pickle
diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py
index 7e219d16..d84b78ea 100644
--- a/modelscope/hub/utils/utils.py
+++ b/modelscope/hub/utils/utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import hashlib
 import os
 from typing import Optional
diff --git a/modelscope/models/base/__init__.py b/modelscope/models/base/__init__.py
index ab7901af..8c47ecaf 100644
--- a/modelscope/models/base/__init__.py
+++ b/modelscope/models/base/__init__.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .base_head import *  # noqa F403
 from .base_model import *  # noqa F403
 from .base_torch_head import *  # noqa F403
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 58957234..0fb877b7 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import math
 import os
 from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py
index 825400c4..0548f7b9 100644
--- a/modelscope/msdatasets/utils/dataset_builder.py
+++ b/modelscope/msdatasets/utils/dataset_builder.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Mapping, Sequence, Union
 
diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py
index 769bed93..ef42f75f 100644
--- a/modelscope/msdatasets/utils/dataset_utils.py
+++ b/modelscope/msdatasets/utils/dataset_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from collections import defaultdict
 from typing import Any, Mapping, Optional, Sequence, Union
diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py
index eb1c99ef..2e21bf50 100644
--- a/modelscope/msdatasets/utils/download_utils.py
+++ b/modelscope/msdatasets/utils/download_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Optional
 
 from datasets.utils.download_manager import DownloadConfig, DownloadManager
diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py
index 9a7040a1..4a403876 100644
--- a/modelscope/msdatasets/utils/oss_utils.py
+++ b/modelscope/msdatasets/utils/oss_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from __future__ import print_function
 import os
 
diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py
index fbe5c531..4813b89f 100644
--- a/modelscope/msdatasets/utils/upload_utils.py
+++ b/modelscope/msdatasets/utils/upload_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .oss_utils import OssUtilities
 
 
diff --git a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py
index f5a180b6..d3b9fef3 100644
--- a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py
index d15970d2..76011be0 100644
--- a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
index 7d3ffed3..fc123e2f 100644
--- a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
+++ b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
index f402cc29..7516c5be 100644
--- a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
+++ b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional
 
 import torch
diff --git a/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py
index bc697b05..3a9284f1 100644
--- a/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py
index cdafafd3..f59100cb 100644
--- a/modelscope/utils/ast_utils.py
+++ b/modelscope/utils/ast_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import ast
 import contextlib
 import hashlib
diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py
index 7d972118..0b966bef 100644
--- a/modelscope/utils/config.py
+++ b/modelscope/utils/config.py
@@ -1,4 +1,6 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright (c) OpenMMLab. All rights reserved.
+# Major implementation is borrowed and modified from
+# https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py
 
 import copy
 import os
diff --git a/modelscope/utils/config_ds.py b/modelscope/utils/config_ds.py
index bafe3f99..fce823c4 100644
--- a/modelscope/utils/config_ds.py
+++ b/modelscope/utils/config_ds.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from pathlib import Path
 
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index 9ec2c4f3..98ba533e 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import cv2
 import numpy as np
 
diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py
index 93535c1e..41ac0bca 100644
--- a/modelscope/utils/demo_utils.py
+++ b/modelscope/utils/demo_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import io
 
 import cv2
diff --git a/modelscope/utils/model_tag.py b/modelscope/utils/model_tag.py
index 9c494eac..7065e8f3 100644
--- a/modelscope/utils/model_tag.py
+++ b/modelscope/utils/model_tag.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import logging
 import os
 
diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py
index 95d2beea..8b6c24a7 100644
--- a/modelscope/utils/regress_test_utils.py
+++ b/modelscope/utils/regress_test_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import contextlib
 import hashlib
 import os
diff --git a/modelscope/utils/type_assert.py b/modelscope/utils/type_assert.py
index aaeadcb9..f732a81a 100644
--- a/modelscope/utils/type_assert.py
+++ b/modelscope/utils/type_assert.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from functools import wraps
 from inspect import signature
 
diff --git a/tests/hub/test_hub_examples.py b/tests/hub/test_hub_examples.py
index 3fb6823f..d1f7594e 100644
--- a/tests/hub/test_hub_examples.py
+++ b/tests/hub/test_hub_examples.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.hub.api import HubApi
diff --git a/tests/hub/test_utils.py b/tests/hub/test_utils.py
index 38a74fd4..3d312dc0 100644
--- a/tests/hub/test_utils.py
+++ b/tests/hub/test_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import shutil
 from codecs import ignore_errors
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index 9780ac4b..762530f4 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.models import Model
diff --git a/tests/pipelines/test_animal_recognition.py b/tests/pipelines/test_animal_recognition.py
index 7d5f0561..eb9f92e6 100644
--- a/tests/pipelines/test_animal_recognition.py
+++ b/tests/pipelines/test_animal_recognition.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.pipelines import pipeline
diff --git a/tests/pipelines/test_general_image_classification.py b/tests/pipelines/test_general_image_classification.py
index b35f3696..d5357f02 100644
--- a/tests/pipelines/test_general_image_classification.py
+++ b/tests/pipelines/test_general_image_classification.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.pipelines import pipeline
diff --git a/tests/pipelines/test_general_recognition.py b/tests/pipelines/test_general_recognition.py
index cbcb927b..ba713bbe 100644
--- a/tests/pipelines/test_general_recognition.py
+++ b/tests/pipelines/test_general_recognition.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.pipelines import pipeline
diff --git a/tests/pipelines/test_image_panoptic_segmentation.py b/tests/pipelines/test_image_panoptic_segmentation.py
index a1657585..4f12e6af 100644
--- a/tests/pipelines/test_image_panoptic_segmentation.py
+++ b/tests/pipelines/test_image_panoptic_segmentation.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 import cv2
diff --git a/tests/pipelines/test_image_semantic_segmentation.py b/tests/pipelines/test_image_semantic_segmentation.py
index c7876906..286d317a 100644
--- a/tests/pipelines/test_image_semantic_segmentation.py
+++ b/tests/pipelines/test_image_semantic_segmentation.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 import cv2
diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py
index 1b23a6a7..fea7afd7 100644
--- a/tests/pipelines/test_key_word_spotting_farfield.py
+++ b/tests/pipelines/test_key_word_spotting_farfield.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path
 import unittest
 
diff --git a/tests/pipelines/test_product_retrieval_embedding.py b/tests/pipelines/test_product_retrieval_embedding.py
index 235847be..2483d53a 100644
--- a/tests/pipelines/test_product_retrieval_embedding.py
+++ b/tests/pipelines/test_product_retrieval_embedding.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 import numpy as np
diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py
index 517facae..e5f97c02 100644
--- a/tests/pipelines/test_speech_signal_process.py
+++ b/tests/pipelines/test_speech_signal_process.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path
 import unittest
 
diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py
index 374f0fd2..e82cf43e 100644
--- a/tests/pipelines/test_text_to_speech.py
+++ b/tests/pipelines/test_text_to_speech.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 # NOTICE: Tensorflow 1.15 seems not so compatible with pytorch.
diff --git a/tests/pipelines/test_tinynas_classification.py b/tests/pipelines/test_tinynas_classification.py
index 204b8bdb..ebc6b722 100644
--- a/tests/pipelines/test_tinynas_classification.py
+++ b/tests/pipelines/test_tinynas_classification.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.pipelines import pipeline
diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py
index b13644be..63db9145 100644
--- a/tests/pipelines/test_tinynas_detection.py
+++ b/tests/pipelines/test_tinynas_detection.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.pipelines import pipeline
diff --git a/tests/pipelines/test_virtual_try_on.py b/tests/pipelines/test_virtual_try_on.py
index e1dd78a2..5c18dcc4 100644
--- a/tests/pipelines/test_virtual_try_on.py
+++ b/tests/pipelines/test_virtual_try_on.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 import cv2
diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py
index ed8cd1fe..c0860529 100644
--- a/tests/trainers/audio/test_ans_trainer.py
+++ b/tests/trainers/audio/test_ans_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import shutil
 import tempfile
diff --git a/tests/trainers/test_dialog_intent_trainer.py b/tests/trainers/test_dialog_intent_trainer.py
index b183a690..207387ac 100644
--- a/tests/trainers/test_dialog_intent_trainer.py
+++ b/tests/trainers/test_dialog_intent_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import shutil
 import tempfile
diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
index 9166292f..f1a50035 100644
--- a/tests/utils/__init__.py
+++ b/tests/utils/__init__.py
@@ -1 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .profiler import *  # noqa F403
diff --git a/tests/utils/profiler.py b/tests/utils/profiler.py
index 92708ad3..f5a522ef 100644
--- a/tests/utils/profiler.py
+++ b/tests/utils/profiler.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import importlib
 import sys
 from functools import wraps

From 4a9dfbf09540eba7da22291fef44accefe3b4093 Mon Sep 17 00:00:00 2001
From: "tingwei.gtw" <tingwei.gtw@alibaba-inc.com>
Date: Tue, 20 Sep 2022 18:39:43 +0800
Subject: [PATCH 115/175] [to #42322933] fix video inpainting cpu inference    
     Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10186204

---
 .../models/cv/video_inpainting/__init__.py     |  2 +-
 .../models/cv/video_inpainting/inpainting.py   |  7 ++++---
 .../cv/video_inpainting/inpainting_model.py    | 18 +++++++++++++-----
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/modelscope/models/cv/video_inpainting/__init__.py b/modelscope/models/cv/video_inpainting/__init__.py
index fd93fe3c..f5489da9 100644
--- a/modelscope/models/cv/video_inpainting/__init__.py
+++ b/modelscope/models/cv/video_inpainting/__init__.py
@@ -1,4 +1,4 @@
-# copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
diff --git a/modelscope/models/cv/video_inpainting/inpainting.py b/modelscope/models/cv/video_inpainting/inpainting.py
index 9632e01c..e2af2ad0 100644
--- a/modelscope/models/cv/video_inpainting/inpainting.py
+++ b/modelscope/models/cv/video_inpainting/inpainting.py
@@ -1,6 +1,6 @@
 """ VideoInpaintingProcess
-Base modules are adapted from https://github.com/researchmm/STTN,
-originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+The implementation here is modified based on STTN,
+originally Apache 2.0 License and publicly available at https://github.com/researchmm/STTN
 """
 
 import os
@@ -243,7 +243,8 @@ def inpainting_by_model_balance(model, video_inputPath, mask_path,
                 for m in masks_temp
             ]
             masks_temp = _to_tensors(masks_temp).unsqueeze(0)
-            feats_temp, masks_temp = feats_temp.cuda(), masks_temp.cuda()
+            if torch.cuda.is_available():
+                feats_temp, masks_temp = feats_temp.cuda(), masks_temp.cuda()
             comp_frames = [None] * video_length
             model.eval()
             with torch.no_grad():
diff --git a/modelscope/models/cv/video_inpainting/inpainting_model.py b/modelscope/models/cv/video_inpainting/inpainting_model.py
index a791b0ab..ffecde67 100644
--- a/modelscope/models/cv/video_inpainting/inpainting_model.py
+++ b/modelscope/models/cv/video_inpainting/inpainting_model.py
@@ -1,15 +1,18 @@
-""" VideoInpaintingNetwork
-Base modules are adapted from https://github.com/researchmm/STTN,
-originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+""" VideoInpaintingProcess
+The implementation here is modified based on STTN,
+ originally Apache 2.0 License and publicly available at https://github.com/researchmm/STTN
 """
 
 import math
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import torchvision.models as models
 
 from modelscope.metainfo import Models
+from modelscope.models import Model
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import MODELS
 from modelscope.utils.constant import ModelFile, Tasks
@@ -84,8 +87,13 @@ class VideoInpainting(TorchModel):
         super().__init__(
             model_dir=model_dir, device_id=device_id, *args, **kwargs)
         self.model = InpaintGenerator()
-        pretrained_params = torch.load('{}/{}'.format(
-            model_dir, ModelFile.TORCH_MODEL_BIN_FILE))
+        if torch.cuda.is_available():
+            device = 'cuda'
+        else:
+            device = 'cpu'
+        pretrained_params = torch.load(
+            '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location=device)
         self.model.load_state_dict(pretrained_params['netG'])
         self.model.eval()
         self.device_id = device_id

From a8665cc8c550d9783ae426bd3383d9440e8cc68a Mon Sep 17 00:00:00 2001
From: "siyang.ssy" <siyang.ssy@alibaba-inc.com>
Date: Tue, 20 Sep 2022 20:01:16 +0800
Subject: [PATCH 116/175] fix video_multi_model_embedding         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10191199

---
 .../multi_modal/mmr/models/clip_for_mm_video_embedding.py    | 5 ++++-
 modelscope/models/multi_modal/mmr/models/modeling.py         | 3 ---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
index 4e959a17..8d13e745 100644
--- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
+++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
@@ -42,7 +42,10 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
         self.max_frames = model_config['max_frames']
         self.feature_framerate = model_config['feature_framerate']
         self.image_resolution = 224
-        self.device = model_config['device']
+        if torch.cuda.is_available():
+            self.device = model_config['device']
+        else:
+            self.device = 'cpu'
         self.init_model = f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}'
 
         self.tokenizer = ClipTokenizer(model_dir)
diff --git a/modelscope/models/multi_modal/mmr/models/modeling.py b/modelscope/models/multi_modal/mmr/models/modeling.py
index 214e65c7..21cc4c80 100644
--- a/modelscope/models/multi_modal/mmr/models/modeling.py
+++ b/modelscope/models/multi_modal/mmr/models/modeling.py
@@ -85,9 +85,6 @@ class CLIP4Clip(nn.Module):
             linear_patch=config['linear_patch'],
             use_gc=config['use_gc']).float()
 
-        if (platform.system() != 'Darwin'):
-            convert_weights(self.clip)  # fp16
-
         if backbone in ['ViT-B/32', 'ViT-B/16']:
             cross_config = SimpleNamespace(**{
                 'hidden_size': 512,

From 6222804c381534ca9d7764fa3e67db59d2442468 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Tue, 20 Sep 2022 20:07:36 +0800
Subject: [PATCH 117/175] [to #44902165] bump version to 0.4.3

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index a9873473..908c0bb7 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.4.2'
+__version__ = '0.4.3'

From 90b214b2e3e37b1c8a5c722435570df2ad0964f6 Mon Sep 17 00:00:00 2001
From: "bin.xue" <bin.xue@alibaba-inc.com>
Date: Wed, 21 Sep 2022 12:53:33 +0800
Subject: [PATCH 118/175] [to #42322933] add copyright information for audio
 modules         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10186837

---
 modelscope/metrics/audio_noise_metric.py             |  2 ++
 modelscope/models/audio/aec/layers/activations.py    |  2 ++
 .../models/audio/aec/layers/affine_transform.py      |  2 ++
 modelscope/models/audio/aec/layers/deep_fsmn.py      |  2 ++
 modelscope/models/audio/aec/layers/layer_base.py     |  2 ++
 modelscope/models/audio/aec/layers/uni_deep_fsmn.py  |  2 ++
 modelscope/models/audio/aec/network/loss.py          |  2 ++
 .../models/audio/aec/network/modulation_loss.py      |  2 ++
 modelscope/models/audio/aec/network/se_net.py        |  2 ++
 modelscope/models/audio/ans/complex_nn.py            | 11 ++++++-----
 modelscope/models/audio/ans/unet.py                  | 12 +++++++-----
 modelscope/models/audio/kws/farfield/fsmn.py         |  2 ++
 modelscope/models/audio/kws/farfield/fsmn_sele_v2.py |  2 ++
 modelscope/models/audio/kws/farfield/model.py        |  2 ++
 modelscope/models/audio/kws/farfield/model_def.py    |  2 ++
 modelscope/pipelines/audio/ans_pipeline.py           |  2 ++
 modelscope/pipelines/audio/kws_farfield_pipeline.py  |  2 ++
 modelscope/pipelines/audio/linear_aec_pipeline.py    |  2 ++
 modelscope/preprocessors/audio.py                    |  2 ++
 19 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/modelscope/metrics/audio_noise_metric.py b/modelscope/metrics/audio_noise_metric.py
index 16c5261f..f26db46d 100644
--- a/modelscope/metrics/audio_noise_metric.py
+++ b/modelscope/metrics/audio_noise_metric.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Dict
 
 from modelscope.metainfo import Metrics
diff --git a/modelscope/models/audio/aec/layers/activations.py b/modelscope/models/audio/aec/layers/activations.py
index b0215bcc..f78ad4b5 100644
--- a/modelscope/models/audio/aec/layers/activations.py
+++ b/modelscope/models/audio/aec/layers/activations.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import torch.nn as nn
 
 from .layer_base import LayerBase
diff --git a/modelscope/models/audio/aec/layers/affine_transform.py b/modelscope/models/audio/aec/layers/affine_transform.py
index 33479505..2de8a03f 100644
--- a/modelscope/models/audio/aec/layers/affine_transform.py
+++ b/modelscope/models/audio/aec/layers/affine_transform.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import numpy as np
 import torch as th
 import torch.nn as nn
diff --git a/modelscope/models/audio/aec/layers/deep_fsmn.py b/modelscope/models/audio/aec/layers/deep_fsmn.py
index 72ba07dc..1582b908 100644
--- a/modelscope/models/audio/aec/layers/deep_fsmn.py
+++ b/modelscope/models/audio/aec/layers/deep_fsmn.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import numpy as np
 import torch as th
 import torch.nn as nn
diff --git a/modelscope/models/audio/aec/layers/layer_base.py b/modelscope/models/audio/aec/layers/layer_base.py
index e56c4bc0..7c39e5be 100644
--- a/modelscope/models/audio/aec/layers/layer_base.py
+++ b/modelscope/models/audio/aec/layers/layer_base.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import abc
 import re
 
diff --git a/modelscope/models/audio/aec/layers/uni_deep_fsmn.py b/modelscope/models/audio/aec/layers/uni_deep_fsmn.py
index c22460c4..a276db05 100644
--- a/modelscope/models/audio/aec/layers/uni_deep_fsmn.py
+++ b/modelscope/models/audio/aec/layers/uni_deep_fsmn.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import numpy as np
 import torch as th
 import torch.nn as nn
diff --git a/modelscope/models/audio/aec/network/loss.py b/modelscope/models/audio/aec/network/loss.py
index 743661b3..1f20072a 100644
--- a/modelscope/models/audio/aec/network/loss.py
+++ b/modelscope/models/audio/aec/network/loss.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import torch
 import torch.nn.functional as F
 
diff --git a/modelscope/models/audio/aec/network/modulation_loss.py b/modelscope/models/audio/aec/network/modulation_loss.py
index a45ddead..3017b5c6 100644
--- a/modelscope/models/audio/aec/network/modulation_loss.py
+++ b/modelscope/models/audio/aec/network/modulation_loss.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import math
 
 import torch
diff --git a/modelscope/models/audio/aec/network/se_net.py b/modelscope/models/audio/aec/network/se_net.py
index 837cad3c..40639605 100644
--- a/modelscope/models/audio/aec/network/se_net.py
+++ b/modelscope/models/audio/aec/network/se_net.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py
index 9768eff7..beaa3187 100644
--- a/modelscope/models/audio/ans/complex_nn.py
+++ b/modelscope/models/audio/ans/complex_nn.py
@@ -1,9 +1,10 @@
-"""
-The implementation of class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d
- here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
-and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch
+# Copyright (c) Alibaba, Inc. and its affiliates.
+#
+# The implementation of class ComplexConv2d, ComplexConvTranspose2d and
+# ComplexBatchNorm2d here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr
+# / Seoul National Univ., ESTsoft ) and publicly available at
+# https://github.com/sweetcocoa/DeepComplexUNetPyTorch
 
-"""
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py
index 3a9c5549..7b4df1e9 100644
--- a/modelscope/models/audio/ans/unet.py
+++ b/modelscope/models/audio/ans/unet.py
@@ -1,8 +1,10 @@
-"""
-The implementation here is modified based on
- Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
-and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch
-"""
+# Copyright (c) Alibaba, Inc. and its affiliates.
+#
+# The implementation here is modified based on
+# Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
+# and publicly available at
+# https://github.com/sweetcocoa/DeepComplexUNetPyTorch
+
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/audio/kws/farfield/fsmn.py b/modelscope/models/audio/kws/farfield/fsmn.py
index e88d3976..e06d7911 100644
--- a/modelscope/models/audio/kws/farfield/fsmn.py
+++ b/modelscope/models/audio/kws/farfield/fsmn.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import numpy as np
 import torch
 import torch.nn as nn
diff --git a/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py
index 1884e533..8af16cc9 100644
--- a/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py
+++ b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py
index 428ec367..fea82194 100644
--- a/modelscope/models/audio/kws/farfield/model.py
+++ b/modelscope/models/audio/kws/farfield/model.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Dict
 
diff --git a/modelscope/models/audio/kws/farfield/model_def.py b/modelscope/models/audio/kws/farfield/model_def.py
index 3f5ba7d7..be9cca2c 100644
--- a/modelscope/models/audio/kws/farfield/model_def.py
+++ b/modelscope/models/audio/kws/farfield/model_def.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import math
 import struct
 from enum import Enum
diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py
index 62399684..e55f613e 100644
--- a/modelscope/pipelines/audio/ans_pipeline.py
+++ b/modelscope/pipelines/audio/ans_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import io
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/audio/kws_farfield_pipeline.py b/modelscope/pipelines/audio/kws_farfield_pipeline.py
index 62848a27..62f58fee 100644
--- a/modelscope/pipelines/audio/kws_farfield_pipeline.py
+++ b/modelscope/pipelines/audio/kws_farfield_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import io
 import wave
 from typing import Any, Dict
diff --git a/modelscope/pipelines/audio/linear_aec_pipeline.py b/modelscope/pipelines/audio/linear_aec_pipeline.py
index 0e73b697..e1e75ddb 100644
--- a/modelscope/pipelines/audio/linear_aec_pipeline.py
+++ b/modelscope/pipelines/audio/linear_aec_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import importlib
 import os
 from typing import Any, Dict
diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py
index dd2f1fc1..1e659218 100644
--- a/modelscope/preprocessors/audio.py
+++ b/modelscope/preprocessors/audio.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import io
 import os
 from typing import Any, Dict, Tuple, Union

From 83ab586a6537a71512931d0d5c0c145e6532bd74 Mon Sep 17 00:00:00 2001
From: "yuxiang.tyx" <yuxiang.tyx@alibaba-inc.com>
Date: Wed, 21 Sep 2022 12:56:08 +0800
Subject: [PATCH 119/175] [to #42322933]license supplement for face_detection
 and face_recognition         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10192162

    * license supplement for face_detection and face_recognition
---
 modelscope/models/cv/face_detection/mmdet_patch/__init__.py  | 5 ++---
 .../cv/face_detection/mmdet_patch/core/bbox/__init__.py      | 4 ++++
 .../cv/face_detection/mmdet_patch/core/bbox/transforms.py    | 3 ++-
 .../mmdet_patch/core/post_processing/__init__.py             | 4 ++++
 .../mmdet_patch/core/post_processing/bbox_nms.py             | 3 ++-
 .../cv/face_detection/mmdet_patch/datasets/__init__.py       | 4 ++++
 .../mmdet_patch/datasets/pipelines/__init__.py               | 4 ++++
 .../mmdet_patch/datasets/pipelines/transforms.py             | 3 ++-
 .../cv/face_detection/mmdet_patch/datasets/retinaface.py     | 3 ++-
 .../models/cv/face_detection/mmdet_patch/models/__init__.py  | 4 ++++
 .../face_detection/mmdet_patch/models/backbones/__init__.py  | 4 ++++
 .../cv/face_detection/mmdet_patch/models/backbones/resnet.py | 3 ++-
 .../mmdet_patch/models/dense_heads/__init__.py               | 4 ++++
 .../mmdet_patch/models/dense_heads/scrfd_head.py             | 3 ++-
 .../face_detection/mmdet_patch/models/detectors/__init__.py  | 4 ++++
 .../cv/face_detection/mmdet_patch/models/detectors/scrfd.py  | 3 ++-
 modelscope/models/cv/face_recognition/align_face.py          | 4 ++++
 .../models/cv/face_recognition/torchkit/backbone/__init__.py | 2 ++
 .../models/cv/face_recognition/torchkit/backbone/common.py   | 2 ++
 .../cv/face_recognition/torchkit/backbone/model_irse.py      | 4 ++--
 .../cv/face_recognition/torchkit/backbone/model_resnet.py    | 4 ++--
 modelscope/pipelines/cv/face_detection_pipeline.py           | 1 +
 modelscope/pipelines/cv/face_recognition_pipeline.py         | 1 +
 23 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/modelscope/models/cv/face_detection/mmdet_patch/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/__init__.py
index 921bdc08..5a895582 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/__init__.py
@@ -1,5 +1,4 @@
 """
-mmdet_patch is based on
-https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet,
-all duplicate functions from official mmdetection are removed.
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet
 """
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py
index 8375649c..cf1b7313 100644
--- a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox
+"""
 from .transforms import bbox2result, distance2kps, kps2distance
 
 __all__ = ['bbox2result', 'distance2kps', 'kps2distance']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py
index 26278837..d65480eb 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox/transforms.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox/transforms.py
 """
 import numpy as np
 import torch
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py
index 8cd31348..61602fd3 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py
+"""
 from .bbox_nms import multiclass_nms
 
 __all__ = ['multiclass_nms']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py
index efe8813f..7a4f5b3a 100644
--- a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py
 """
 import torch
 
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py
index 07a45208..cea179b0 100644
--- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets
+"""
 from .retinaface import RetinaFaceDataset
 
 __all__ = ['RetinaFaceDataset']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py
index 979212a3..85288910 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines
+"""
 from .transforms import RandomSquareCrop
 
 __all__ = ['RandomSquareCrop']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py
index 3048cefa..241f2c0e 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
 """
 import numpy as np
 from mmdet.datasets.builder import PIPELINES
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py
index bf20764b..bbacd9be 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/retinaface.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/retinaface.py
 """
 import numpy as np
 from mmdet.datasets.builder import DATASETS
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py
index 38c8ff5b..bd5d5f5f 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py
@@ -1,2 +1,6 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models
+"""
 from .dense_heads import *  # noqa: F401,F403
 from .detectors import *  # noqa: F401,F403
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py
index 2d930bf4..5c3b190e 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones
+"""
 from .resnet import ResNetV1e
 
 __all__ = ['ResNetV1e']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py
index 54bcb127..a5862a58 100644
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/resnet.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/resnet.py
 """
 import torch.nn as nn
 import torch.utils.checkpoint as cp
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py
index e67031bc..9ba63b68 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads
+"""
 from .scrfd_head import SCRFDHead
 
 __all__ = ['SCRFDHead']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py
index 1667f29f..acc45670 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py
 """
 import numpy as np
 import torch
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py
index 1c16028f..7935606a 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors
+"""
 from .scrfd import SCRFD
 
 __all__ = ['SCRFD']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py
index 98b6702c..a5f5cac2 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py
 """
 import torch
 from mmdet.models.builder import DETECTORS
diff --git a/modelscope/models/cv/face_recognition/align_face.py b/modelscope/models/cv/face_recognition/align_face.py
index a6469a10..0477375a 100644
--- a/modelscope/models/cv/face_recognition/align_face.py
+++ b/modelscope/models/cv/face_recognition/align_face.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/blob/master/python-package/insightface/utils/face_align.py
+"""
 import cv2
 import numpy as np
 from skimage import transform as trans
diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py
index a58d8e17..afe89963 100755
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py
+++ b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from TFace, made publicly available under the Apache-2.0 license at
+# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone
 from .model_irse import (IR_18, IR_34, IR_50, IR_101, IR_152, IR_200, IR_SE_50,
                          IR_SE_101, IR_SE_152, IR_SE_200)
 from .model_resnet import ResNet_50, ResNet_101, ResNet_152
diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py
index 426d2591..a1683225 100755
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py
+++ b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from TFace, made publicly available under the Apache-2.0 license at
+# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/common.py
 import torch
 import torch.nn as nn
 from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Linear, Module, ReLU,
diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py
index 4fb7ee9c..1982ca05 100755
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py
+++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py
@@ -1,5 +1,5 @@
-# based on:
-# https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_irse.py
+# The implementation is adopted from TFace, made publicly available under the Apache-2.0 license at
+# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_irse.py
 from collections import namedtuple
 
 from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear,
diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py
index 7072f384..568e24ff 100755
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py
+++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py
@@ -1,5 +1,5 @@
-# based on:
-# https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_resnet.py
+# The implementation is adopted from TFace, made publicly available under the Apache-2.0 license at
+# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_resnet.py
 import torch.nn as nn
 from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear,
                       MaxPool2d, Module, ReLU, Sequential)
diff --git a/modelscope/pipelines/cv/face_detection_pipeline.py b/modelscope/pipelines/cv/face_detection_pipeline.py
index 8fda5b46..eff5b70f 100644
--- a/modelscope/pipelines/cv/face_detection_pipeline.py
+++ b/modelscope/pipelines/cv/face_detection_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/face_recognition_pipeline.py b/modelscope/pipelines/cv/face_recognition_pipeline.py
index 506346df..873e4a1f 100644
--- a/modelscope/pipelines/cv/face_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/face_recognition_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 

From 4c26a5a757ae0b89be4ddb76b6b26c814e9fc5fa Mon Sep 17 00:00:00 2001
From: "lllcho.lc" <lllcho.lc@alibaba-inc.com>
Date: Wed, 21 Sep 2022 13:01:29 +0800
Subject: [PATCH 120/175] [to #42322933] add license header for
 cv/action-detection         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10198112

    * add lic
---
 modelscope/models/cv/action_detection/action_detection_onnx.py | 2 ++
 modelscope/pipelines/cv/action_recognition_pipeline.py         | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/modelscope/models/cv/action_detection/action_detection_onnx.py b/modelscope/models/cv/action_detection/action_detection_onnx.py
index 3c171473..1c8be354 100644
--- a/modelscope/models/cv/action_detection/action_detection_onnx.py
+++ b/modelscope/models/cv/action_detection/action_detection_onnx.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import os.path as osp
 import shutil
diff --git a/modelscope/pipelines/cv/action_recognition_pipeline.py b/modelscope/pipelines/cv/action_recognition_pipeline.py
index e3400ea7..7f1a46b2 100644
--- a/modelscope/pipelines/cv/action_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/action_recognition_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import math
 import os.path as osp
 from typing import Any, Dict

From 9e66f64858acb8b7fc36285638e5267d6008b2f7 Mon Sep 17 00:00:00 2001
From: "lee.lcy" <lee.lcy@alibaba-inc.com>
Date: Wed, 21 Sep 2022 13:07:10 +0800
Subject: [PATCH 121/175] [to #42322933] add license header for
 image_reid_person         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10190901

---
 modelscope/models/cv/image_reid_person/pass_model.py      | 2 +-
 modelscope/models/cv/image_reid_person/transreid_model.py | 2 +-
 modelscope/pipelines/cv/image_reid_person_pipeline.py     | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/modelscope/models/cv/image_reid_person/pass_model.py b/modelscope/models/cv/image_reid_person/pass_model.py
index 2222fedb..3b032949 100644
--- a/modelscope/models/cv/image_reid_person/pass_model.py
+++ b/modelscope/models/cv/image_reid_person/pass_model.py
@@ -1,4 +1,4 @@
-# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on
+# The implementation is adopted from PASS-reID, made publicly available under the Apache-2.0 License at
 # https://github.com/CASIA-IVA-Lab/PASS-reID
 
 import os
diff --git a/modelscope/models/cv/image_reid_person/transreid_model.py b/modelscope/models/cv/image_reid_person/transreid_model.py
index 275c4e22..5bceb468 100644
--- a/modelscope/models/cv/image_reid_person/transreid_model.py
+++ b/modelscope/models/cv/image_reid_person/transreid_model.py
@@ -1,4 +1,4 @@
-# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on
+# The implementation is adopted from PASS-reID, made publicly available under the Apache-2.0 License at
 # https://github.com/CASIA-IVA-Lab/PASS-reID
 
 import collections.abc as container_abcs
diff --git a/modelscope/pipelines/cv/image_reid_person_pipeline.py b/modelscope/pipelines/cv/image_reid_person_pipeline.py
index a14666a1..64674a65 100644
--- a/modelscope/pipelines/cv/image_reid_person_pipeline.py
+++ b/modelscope/pipelines/cv/image_reid_person_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import math
 import os
 from typing import Any, Dict

From 7f2a781a47511a47e715075c527b869c3b968061 Mon Sep 17 00:00:00 2001
From: "lanjinpeng.ljp" <lanjinpeng.ljp@alibaba-inc.com>
Date: Wed, 21 Sep 2022 13:08:05 +0800
Subject: [PATCH 122/175] [to #42322933]fix file header in
 video_single_object_tracking         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10195146

---
 .../models/cv/video_single_object_tracking/config/ostrack.py   | 3 ++-
 .../cv/video_single_object_tracking/models/layers/attn.py      | 3 ++-
 .../video_single_object_tracking/models/layers/attn_blocks.py  | 3 ++-
 .../cv/video_single_object_tracking/models/layers/head.py      | 3 ++-
 .../video_single_object_tracking/models/layers/patch_embed.py  | 3 ++-
 .../models/ostrack/base_backbone.py                            | 3 ++-
 .../cv/video_single_object_tracking/models/ostrack/ostrack.py  | 3 ++-
 .../cv/video_single_object_tracking/models/ostrack/utils.py    | 3 ++-
 .../cv/video_single_object_tracking/models/ostrack/vit_ce.py   | 3 ++-
 .../models/cv/video_single_object_tracking/tracker/ostrack.py  | 3 ++-
 .../models/cv/video_single_object_tracking/utils/utils.py      | 3 ++-
 .../pipelines/cv/video_single_object_tracking_pipeline.py      | 1 +
 12 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py
index 8be07928..6805c503 100644
--- a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py
+++ b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 from easydict import EasyDict as edict
 
 cfg = edict()
diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py
index 00eb7e1c..e245c821 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch.nn as nn
 
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py
index 3505d5e1..702c84f1 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import math
 
 import torch
diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py
index 77706dbc..e0dc7b59 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py
index b1099fdf..c001663f 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch.nn as nn
 from timm.models.layers import to_2tuple
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py
index de3a7b83..20d73422 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch.nn as nn
 from timm.models.layers import to_2tuple
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py
index 40ed54f1..52704a6c 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch
 from torch import nn
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py
index e1130069..46e7c18a 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch
 
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py
index 9f010332..f186cf89 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 from functools import partial
 
 import torch
diff --git a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py
index 02f4c79e..5093a72d 100644
--- a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py
+++ b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch
 
 from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
diff --git a/modelscope/models/cv/video_single_object_tracking/utils/utils.py b/modelscope/models/cv/video_single_object_tracking/utils/utils.py
index 51911957..752ec272 100644
--- a/modelscope/models/cv/video_single_object_tracking/utils/utils.py
+++ b/modelscope/models/cv/video_single_object_tracking/utils/utils.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import math
 from typing import Optional
 
diff --git a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py
index f4ba4d0b..c47fc15f 100644
--- a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py
+++ b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 

From c2b1ff8389887c056b866347654a56c05e99ab81 Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Wed, 21 Sep 2022 14:25:06 +0800
Subject: [PATCH 123/175] [to #42322933] Add exporter module for onnx,ts and
 other formats.

1. Add exporter module
2. Move collate_fn out of the base pipeline class for reusing.
3. Add dummy inputs method in nlp tokenization preprocessor base class
4. Support Mapping in tensor numpify and detaching.
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10037704
---
 modelscope/exporters/__init__.py              |   4 +
 modelscope/exporters/base.py                  |  53 ++++
 modelscope/exporters/builder.py               |  21 ++
 modelscope/exporters/nlp/__init__.py          |   2 +
 ...rt_for_sequence_classification_exporter.py |  81 ++++++
 modelscope/exporters/torch_model_exporter.py  | 247 ++++++++++++++++++
 modelscope/pipelines/base.py                  |  81 +++---
 modelscope/utils/constant.py                  |   1 +
 modelscope/utils/regress_test_utils.py        |  18 +-
 modelscope/utils/tensor_utils.py              |  22 ++
 tests/export/__init__.py                      |   0
 ...st_export_sbert_sequence_classification.py |  37 +++
 12 files changed, 520 insertions(+), 47 deletions(-)
 create mode 100644 modelscope/exporters/__init__.py
 create mode 100644 modelscope/exporters/base.py
 create mode 100644 modelscope/exporters/builder.py
 create mode 100644 modelscope/exporters/nlp/__init__.py
 create mode 100644 modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
 create mode 100644 modelscope/exporters/torch_model_exporter.py
 create mode 100644 tests/export/__init__.py
 create mode 100644 tests/export/test_export_sbert_sequence_classification.py

diff --git a/modelscope/exporters/__init__.py b/modelscope/exporters/__init__.py
new file mode 100644
index 00000000..a597114f
--- /dev/null
+++ b/modelscope/exporters/__init__.py
@@ -0,0 +1,4 @@
+from .base import Exporter
+from .builder import build_exporter
+from .nlp import SbertForSequenceClassificationExporter
+from .torch_model_exporter import TorchModelExporter
diff --git a/modelscope/exporters/base.py b/modelscope/exporters/base.py
new file mode 100644
index 00000000..f19d2bbb
--- /dev/null
+++ b/modelscope/exporters/base.py
@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from abc import ABC, abstractmethod
+
+from modelscope.models import Model
+from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.constant import ModelFile
+from .builder import build_exporter
+
+
+class Exporter(ABC):
+    """Exporter base class to output model to onnx, torch_script, graphdef, etc.
+    """
+
+    def __init__(self):
+        self.model = None
+
+    @classmethod
+    def from_model(cls, model: Model, **kwargs):
+        """Build the Exporter instance.
+
+        @param model: A model instance. It will be used to output the generated file,
+            and the configuration.json in its model_dir field will be used to create the exporter instance.
+        @param kwargs: Extra kwargs used to create the Exporter instance.
+        @return: The Exporter instance
+        """
+        cfg = Config.from_file(
+            os.path.join(model.model_dir, ModelFile.CONFIGURATION))
+        task_name = cfg.task
+        model_cfg = cfg.model
+        if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
+            model_cfg.type = model_cfg.model_type
+        export_cfg = ConfigDict({'type': model_cfg.type})
+        if hasattr(cfg, 'export'):
+            export_cfg.update(cfg.export)
+        exporter = build_exporter(export_cfg, task_name, kwargs)
+        exporter.model = model
+        return exporter
+
+    @abstractmethod
+    def export_onnx(self, outputs: str, opset=11, **kwargs):
+        """Export the model as onnx format files.
+
+        In some cases, several files may be generated,
+        so please return a dict which contains the generated name with the file path.
+
+        @param opset: The version of the ONNX operator set to use.
+        @param outputs: The output dir.
+        @param kwargs: In this default implementation,
+            kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape).
+        @return: A dict containing the model name with the model file path.
+        """
+        pass
diff --git a/modelscope/exporters/builder.py b/modelscope/exporters/builder.py
new file mode 100644
index 00000000..90699c12
--- /dev/null
+++ b/modelscope/exporters/builder.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from modelscope.utils.config import ConfigDict
+from modelscope.utils.registry import Registry, build_from_cfg
+
+EXPORTERS = Registry('exporters')
+
+
+def build_exporter(cfg: ConfigDict,
+                   task_name: str = None,
+                   default_args: dict = None):
+    """Build an exporter from the given config dict via the EXPORTERS registry.
+
+    Args:
+        cfg (:obj:`ConfigDict`): config dict for exporter object.
+        task_name (str, optional): task name passed as the registry group key,
+            refer to :obj:`Tasks` for more details.
+        default_args (dict, optional): Default initialization arguments.
+    """
+    return build_from_cfg(
+        cfg, EXPORTERS, group_key=task_name, default_args=default_args)
diff --git a/modelscope/exporters/nlp/__init__.py b/modelscope/exporters/nlp/__init__.py
new file mode 100644
index 00000000..fdfd2711
--- /dev/null
+++ b/modelscope/exporters/nlp/__init__.py
@@ -0,0 +1,2 @@
+from .sbert_for_sequence_classification_exporter import \
+    SbertForSequenceClassificationExporter
diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
new file mode 100644
index 00000000..dc1e2b92
--- /dev/null
+++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
@@ -0,0 +1,81 @@
+import os
+from collections import OrderedDict
+from typing import Any, Dict, Mapping, Tuple
+
+from torch.utils.data.dataloader import default_collate
+
+from modelscope.exporters.builder import EXPORTERS
+from modelscope.exporters.torch_model_exporter import TorchModelExporter
+from modelscope.metainfo import Models
+from modelscope.preprocessors import Preprocessor, build_preprocessor
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModeKeys, Tasks
+
+
+@EXPORTERS.register_module(
+    Tasks.sentence_similarity, module_name=Models.structbert)
+@EXPORTERS.register_module(
+    Tasks.sentiment_classification, module_name=Models.structbert)
+@EXPORTERS.register_module(Tasks.nli, module_name=Models.structbert)
+@EXPORTERS.register_module(
+    Tasks.zero_shot_classification, module_name=Models.structbert)
+class SbertForSequenceClassificationExporter(TorchModelExporter):
+
+    def generate_dummy_inputs(self,
+                              shape: Tuple = None,
+                              **kwargs) -> Dict[str, Any]:
+        """Generate dummy inputs for model exportation to onnx or other formats by tracing.
+
+        @param shape: A tuple of input shape which should have at most two dimensions.
+            shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor.
+            shape = (8, 128) batch_size=8, sequence_length=128, which will override the config of the preprocessor.
+        @return: Dummy inputs, i.e. batch_size preprocessed unk-token samples collated into one batch.
+        """
+
+        cfg = Config.from_file(
+            os.path.join(self.model.model_dir, 'configuration.json'))
+        field_name = Tasks.find_field_by_task(cfg.task)
+        if 'type' not in cfg.preprocessor and 'val' in cfg.preprocessor:
+            cfg = cfg.preprocessor.val
+        else:
+            cfg = cfg.preprocessor
+
+        batch_size = 1
+        sequence_length = {}
+        if shape is not None:
+            if len(shape) == 1:
+                batch_size = shape[0]
+            elif len(shape) == 2:
+                batch_size, max_length = shape
+                sequence_length = {'sequence_length': max_length}
+
+        cfg.update({
+            'model_dir': self.model.model_dir,
+            'mode': ModeKeys.TRAIN,
+            **sequence_length
+        })
+        preprocessor: Preprocessor = build_preprocessor(cfg, field_name)
+        if preprocessor.pair:
+            first_sequence = preprocessor.tokenizer.unk_token
+            second_sequence = preprocessor.tokenizer.unk_token
+        else:
+            first_sequence = preprocessor.tokenizer.unk_token
+            second_sequence = None
+
+        batched = []
+        for _ in range(batch_size):
+            batched.append(preprocessor((first_sequence, second_sequence)))
+        return default_collate(batched)
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        dynamic_axis = {0: 'batch', 1: 'sequence'}
+        return OrderedDict([
+            ('input_ids', dynamic_axis),
+            ('attention_mask', dynamic_axis),
+            ('token_type_ids', dynamic_axis),
+        ])
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict({'logits': {0: 'batch'}})
diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py
new file mode 100644
index 00000000..98a23fe5
--- /dev/null
+++ b/modelscope/exporters/torch_model_exporter.py
@@ -0,0 +1,247 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from contextlib import contextmanager
+from itertools import chain
+from typing import Any, Dict, Mapping
+
+import torch
+from torch import nn
+from torch.onnx import export as onnx_export
+from torch.onnx.utils import _decide_input_format
+
+from modelscope.models import TorchModel
+from modelscope.pipelines.base import collate_fn
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+from modelscope.utils.regress_test_utils import compare_arguments_nested
+from modelscope.utils.tensor_utils import torch_nested_numpify
+from .base import Exporter
+
+logger = get_logger(__name__)
+
+
+class TorchModelExporter(Exporter):
+    """The torch base class of exporter.
+
+    This class provides the default implementations for exporting onnx and torch script.
+    Each specific model may implement its own exporter by overriding the export_onnx/export_torch_script,
+    and to provide implementations for generate_dummy_inputs/inputs/outputs methods.
+    """
+
+    def export_onnx(self, outputs: str, opset=11, **kwargs):
+        """Export the model as onnx format files.
+
+        In some cases,  several files may be generated,
+        So please return a dict which contains the generated name with the file path.
+
+        @param opset: The version of the ONNX operator set to use.
+        @param outputs: The output dir.
+        @param kwargs: In this default implementation,
+            you can pass the arguments needed by _torch_export_onnx, other unrecognized args
+            will be carried to generate_dummy_inputs as extra arguments (such as input shape).
+        @return: A dict containing the model key - model file path pairs.
+        """
+        model = self.model
+        if not isinstance(model, nn.Module) and hasattr(model, 'model'):
+            model = model.model
+        onnx_file = os.path.join(outputs, ModelFile.ONNX_MODEL_FILE)
+        self._torch_export_onnx(model, onnx_file, opset=opset, **kwargs)
+        return {'model': onnx_file}
+
+    def export_torch_script(self, outputs: str, **kwargs):
+        """Export the model as torch script files.
+
+        In some cases,  several files may be generated,
+        So please return a dict which contains the generated name with the file path.
+
+        @param outputs: The output dir.
+        @param kwargs: In this default implementation,
+            you can pass the arguments needed by _torch_export_torch_script, other unrecognized args
+            will be carried to generate_dummy_inputs as extra arguments (like input shape).
+        @return: A dict contains the model name with the model file path.
+        """
+        model = self.model
+        if not isinstance(model, nn.Module) and hasattr(model, 'model'):
+            model = model.model
+        ts_file = os.path.join(outputs, ModelFile.TS_MODEL_FILE)
+        # generate ts by tracing
+        self._torch_export_torch_script(model, ts_file, **kwargs)
+        return {'model': ts_file}
+
+    def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]:
+        """Generate dummy inputs for model exportation to onnx or other formats by tracing.
+        @return: Dummy inputs.
+        """
+        return None
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Return an ordered dict contains the model's input arguments name with their dynamic axis.
+
+        About the information of dynamic axis please check the dynamic_axes argument of torch.onnx.export function
+        """
+        return None
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Return an ordered dict contains the model's output arguments name with their dynamic axis.
+
+        About the information of dynamic axis please check the dynamic_axes argument of torch.onnx.export function
+        """
+        return None
+
+    def _torch_export_onnx(self,
+                           model: nn.Module,
+                           output: str,
+                           opset: int = 11,
+                           device: str = 'cpu',
+                           validation: bool = True,
+                           rtol: float = None,
+                           atol: float = None,
+                           **kwargs):
+        """Export the model to an onnx format file.
+
+        @param model: A torch.nn.Module instance to export.
+        @param output: The output file.
+        @param opset: The version of the ONNX operator set to use.
+        @param device: The device used to forward, the model and the dummy inputs are moved to it.
+        @param validation: Whether validate the export file.
+        @param rtol: The rtol used to regress the outputs.
+        @param atol: The atol used to regress the outputs.
+        @param kwargs: Extra arguments carried to generate_dummy_inputs (such as input shape).
+        """
+        dummy_inputs = self.generate_dummy_inputs(**kwargs)
+        inputs = self.inputs
+        outputs = self.outputs
+        if dummy_inputs is None or inputs is None or outputs is None:
+            raise NotImplementedError(
+                'Model property dummy_inputs,inputs,outputs must be set.')
+
+        with torch.no_grad():
+            model.eval()
+            device = torch.device(device)
+            model.to(device)
+            dummy_inputs = collate_fn(dummy_inputs, device)
+
+            if isinstance(dummy_inputs, Mapping):
+                dummy_inputs = dict(dummy_inputs)
+            onnx_outputs = list(outputs.keys())
+
+            with replace_call():
+                onnx_export(
+                    model,
+                    (dummy_inputs, ),
+                    f=output,
+                    input_names=list(inputs.keys()),
+                    output_names=onnx_outputs,
+                    dynamic_axes={
+                        name: axes
+                        for name, axes in chain(inputs.items(),
+                                                outputs.items())
+                    },
+                    do_constant_folding=True,
+                    opset_version=opset,
+                )
+
+        if validation:
+            try:
+                import onnx
+                import onnxruntime as ort
+            except ImportError:
+                logger.warning(
+                    'Cannot validate the exported onnx file, because '
+                    'the installation of onnx or onnxruntime cannot be found')
+                return
+            onnx_model = onnx.load(output)
+            onnx.checker.check_model(onnx_model)
+            ort_session = ort.InferenceSession(output)
+            with torch.no_grad():
+                model.eval()
+                outputs_origin = model.forward(
+                    *_decide_input_format(model, dummy_inputs))
+            if isinstance(outputs_origin, Mapping):
+                outputs_origin = torch_nested_numpify(
+                    list(outputs_origin.values()))
+            outputs = ort_session.run(
+                onnx_outputs,
+                torch_nested_numpify(dummy_inputs),
+            )
+
+            tols = {}
+            if rtol is not None:
+                tols['rtol'] = rtol
+            if atol is not None:
+                tols['atol'] = atol
+            if not compare_arguments_nested('Onnx model output match failed',
+                                            outputs, outputs_origin, **tols):
+                raise RuntimeError(
+                    'export onnx failed because of validation error.')
+
+    def _torch_export_torch_script(self,
+                                   model: nn.Module,
+                                   output: str,
+                                   device: str = 'cpu',
+                                   validation: bool = True,
+                                   rtol: float = None,
+                                   atol: float = None,
+                                   **kwargs):
+        """Export the model to a torch script file by tracing it with the dummy inputs.
+
+        @param model: A torch.nn.Module instance to export.
+        @param output: The output file.
+        @param device: The device used to forward, the model and the dummy inputs are moved to it.
+        @param validation: Whether validate the export file.
+        @param rtol: The rtol used to regress the outputs.
+        @param atol: The atol used to regress the outputs.
+        """
+        model.eval()
+        model.to(device)  # keep the model on the same device as the collated dummy inputs
+        dummy_inputs = self.generate_dummy_inputs(**kwargs)  # kwargs: e.g. shape
+        if dummy_inputs is None:
+            raise NotImplementedError(
+                'Model property dummy_inputs must be set.')
+        dummy_inputs = collate_fn(dummy_inputs, device)
+        if isinstance(dummy_inputs, Mapping):
+            dummy_inputs = tuple(dummy_inputs.values())
+        with torch.no_grad():
+            model.eval()
+            with replace_call():
+                traced_model = torch.jit.trace(
+                    model, dummy_inputs, strict=False)
+        torch.jit.save(traced_model, output)
+
+        if validation:
+            ts_model = torch.jit.load(output)
+            with torch.no_grad():
+                model.eval()
+                ts_model.eval()
+                outputs = ts_model.forward(*dummy_inputs)
+                outputs = torch_nested_numpify(outputs)
+                outputs_origin = model.forward(*dummy_inputs)
+                outputs_origin = torch_nested_numpify(outputs_origin)
+            tols = {}
+            if rtol is not None:
+                tols['rtol'] = rtol
+            if atol is not None:
+                tols['atol'] = atol
+            if not compare_arguments_nested(
+                    'Torch script model output match failed', outputs,
+                    outputs_origin, **tols):
+                raise RuntimeError(
+                    'export torch script failed because of validation error.')
+
+
+@contextmanager
+def replace_call():
+    """Temporarily restore the default call method of torch.nn.Module on TorchModel.
+
+    The Model class of modelscope overrides the call method. When exporting to onnx or torchscript, torch will
+    prepare the parameters as the prototype of forward method, and trace the call method, this causes
+    problems. The override is swapped out here and put back on exit, even if the tracing raises.
+    """
+    call_origin = TorchModel.__call__
+    TorchModel.__call__ = TorchModel._call_impl
+    try:
+        yield
+    finally:
+        TorchModel.__call__ = call_origin
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index 5369220f..c5db2b57 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -28,7 +28,7 @@ if is_torch_available():
     import torch
 
 if is_tf_available():
-    import tensorflow as tf
+    pass
 
 Tensor = Union['torch.Tensor', 'tf.Tensor']
 Input = Union[str, tuple, MsDataset, 'Image.Image', 'numpy.ndarray']
@@ -204,44 +204,7 @@ class Pipeline(ABC):
             yield self._process_single(ele, *args, **kwargs)
 
     def _collate_fn(self, data):
-        """Prepare the input just before the forward function.
-        This method will move the tensors to the right device.
-        Usually this method does not need to be overridden.
-
-        Args:
-            data: The data out of the dataloader.
-
-        Returns: The processed data.
-
-        """
-        from torch.utils.data.dataloader import default_collate
-        from modelscope.preprocessors import InputFeatures
-        if isinstance(data, dict) or isinstance(data, Mapping):
-            return type(data)(
-                {k: self._collate_fn(v)
-                 for k, v in data.items()})
-        elif isinstance(data, (tuple, list)):
-            if isinstance(data[0], (int, float)):
-                return default_collate(data).to(self.device)
-            else:
-                return type(data)(self._collate_fn(v) for v in data)
-        elif isinstance(data, np.ndarray):
-            if data.dtype.type is np.str_:
-                return data
-            else:
-                return self._collate_fn(torch.from_numpy(data))
-        elif isinstance(data, torch.Tensor):
-            return data.to(self.device)
-        elif isinstance(data, (bytes, str, int, float, bool, type(None))):
-            return data
-        elif isinstance(data, InputFeatures):
-            return data
-        else:
-            import mmcv
-            if isinstance(data, mmcv.parallel.data_container.DataContainer):
-                return data
-            else:
-                raise ValueError(f'Unsupported data type {type(data)}')
+        return collate_fn(data, self.device)
 
     def _process_single(self, input: Input, *args, **kwargs) -> Dict[str, Any]:
         preprocess_params = kwargs.get('preprocess_params', {})
@@ -410,3 +373,43 @@ class DistributedPipeline(Pipeline):
         @return: The forward results.
         """
         pass
+
+
+def collate_fn(data, device):
+    """Prepare the input just before the forward function.
+
+    Tensors (and numeric lists/arrays, which are converted to tensors) are moved to `device`;
+    mappings, tuples and lists are processed recursively, empty sequences are returned as they are.
+
+    Args:
+        data: The data out of the dataloader.
+        device: The device to move data to.
+
+    Returns: The processed data. Raises ValueError for unsupported data types.
+    """
+    from torch.utils.data.dataloader import default_collate
+    from modelscope.preprocessors import InputFeatures
+    if isinstance(data, dict) or isinstance(data, Mapping):
+        return type(data)({k: collate_fn(v, device) for k, v in data.items()})
+    elif isinstance(data, (tuple, list)):
+        if data and isinstance(data[0], (int, float)):
+            return default_collate(data).to(device)
+        else:
+            return type(data)(collate_fn(v, device) for v in data)
+    elif isinstance(data, np.ndarray):
+        if data.dtype.type is np.str_:
+            return data
+        else:
+            return collate_fn(torch.from_numpy(data), device)
+    elif isinstance(data, torch.Tensor):
+        return data.to(device)
+    elif isinstance(data, (bytes, str, int, float, bool, type(None))):
+        return data
+    elif isinstance(data, InputFeatures):
+        return data
+    else:
+        import mmcv
+        if isinstance(data, mmcv.parallel.data_container.DataContainer):
+            return data
+        else:
+            raise ValueError(f'Unsupported data type {type(data)}')
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 57d38da7..d6b0da40 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -246,6 +246,7 @@ class ModelFile(object):
     ONNX_MODEL_FILE = 'model.onnx'
     LABEL_MAPPING = 'label_mapping.json'
     TRAIN_OUTPUT_DIR = 'output'
+    TS_MODEL_FILE = 'model.ts'
 
 
 class ConfigFields(object):
diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py
index 8b6c24a7..47bbadfe 100644
--- a/modelscope/utils/regress_test_utils.py
+++ b/modelscope/utils/regress_test_utils.py
@@ -352,10 +352,10 @@ def numpify_tensor_nested(tensors, reduction=None, clip_value=10000):
         return type(tensors)(
             numpify_tensor_nested(t, reduction, clip_value) for t in tensors)
     if isinstance(tensors, Mapping):
-        return type(tensors)({
+        return {
             k: numpify_tensor_nested(t, reduction, clip_value)
             for k, t in tensors.items()
-        })
+        }
     if isinstance(tensors, torch.Tensor):
         t: np.ndarray = tensors.cpu().numpy()
         if clip_value is not None:
@@ -375,9 +375,7 @@ def detach_tensor_nested(tensors):
     if isinstance(tensors, (list, tuple)):
         return type(tensors)(detach_tensor_nested(t) for t in tensors)
     if isinstance(tensors, Mapping):
-        return type(tensors)(
-            {k: detach_tensor_nested(t)
-             for k, t in tensors.items()})
+        return {k: detach_tensor_nested(t) for k, t in tensors.items()}
     if isinstance(tensors, torch.Tensor):
         return tensors.detach()
     return tensors
@@ -496,7 +494,11 @@ def intercept_module(module: nn.Module,
         intercept_module(module, io_json, full_name, restore)
 
 
-def compare_arguments_nested(print_content, arg1, arg2):
+def compare_arguments_nested(print_content,
+                             arg1,
+                             arg2,
+                             rtol=1.e-3,
+                             atol=1.e-8):
     type1 = type(arg1)
     type2 = type(arg2)
     if type1.__name__ != type2.__name__:
@@ -515,7 +517,7 @@ def compare_arguments_nested(print_content, arg1, arg2):
             return False
         return True
     elif isinstance(arg1, (float, np.floating)):
-        if not np.isclose(arg1, arg2, rtol=1.e-3, atol=1.e-8, equal_nan=True):
+        if not np.isclose(arg1, arg2, rtol=rtol, atol=atol, equal_nan=True):
             if print_content is not None:
                 print(f'{print_content}, arg1:{arg1}, arg2:{arg2}')
             return False
@@ -562,7 +564,7 @@ def compare_arguments_nested(print_content, arg1, arg2):
         arg2 = np.where(np.equal(arg2, None), np.NaN,
                         arg2).astype(dtype=np.float)
         if not all(
-                np.isclose(arg1, arg2, rtol=1.e-3, atol=1.e-8,
+                np.isclose(arg1, arg2, rtol=rtol, atol=atol,
                            equal_nan=True).flatten()):
             if print_content is not None:
                 print(f'{print_content}')
diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py
index b438e476..b68a639c 100644
--- a/modelscope/utils/tensor_utils.py
+++ b/modelscope/utils/tensor_utils.py
@@ -1,12 +1,24 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Part of the implementation is borrowed from huggingface/transformers.
+from collections.abc import Mapping
 
 
 def torch_nested_numpify(tensors):
+    """ Numpify nested torch tensors.
+
+    NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict.
+
+    @param tensors: Nested torch tensors.
+    @return: The numpify tensors.
+    """
+
     import torch
     "Numpify `tensors` (even if it's a nested list/tuple of tensors)."
     if isinstance(tensors, (list, tuple)):
         return type(tensors)(torch_nested_numpify(t) for t in tensors)
+    if isinstance(tensors, Mapping):
+        # return dict
+        return {k: torch_nested_numpify(t) for k, t in tensors.items()}
     if isinstance(tensors, torch.Tensor):
         t = tensors.cpu()
         return t.numpy()
@@ -14,10 +26,20 @@ def torch_nested_numpify(tensors):
 
 
 def torch_nested_detach(tensors):
+    """ Detach nested torch tensors.
+
+    NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict.
+
+    @param tensors: Nested torch tensors.
+    @return: The detached tensors.
+    """
+
     import torch
     "Detach `tensors` (even if it's a nested list/tuple of tensors)."
     if isinstance(tensors, (list, tuple)):
         return type(tensors)(torch_nested_detach(t) for t in tensors)
+    if isinstance(tensors, Mapping):
+        return {k: torch_nested_detach(t) for k, t in tensors.items()}
     if isinstance(tensors, torch.Tensor):
         return tensors.detach()
     return tensors
diff --git a/tests/export/__init__.py b/tests/export/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py
new file mode 100644
index 00000000..535b3f5d
--- /dev/null
+++ b/tests/export/test_export_sbert_sequence_classification.py
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+
+from modelscope.exporters import Exporter, TorchModelExporter
+from modelscope.models.base import Model
+from modelscope.utils.test_utils import test_level
+
+
+class TestExportSbertSequenceClassification(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+        self.model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_export_sbert_sequence_classification(self):
+        model = Model.from_pretrained(self.model_id)
+        print(
+            Exporter.from_model(model).export_onnx(
+                shape=(2, 256), outputs=self.tmp_dir))
+        print(
+            TorchModelExporter.from_model(model).export_torch_script(
+                shape=(2, 256), outputs=self.tmp_dir))
+
+
+if __name__ == '__main__':
+    unittest.main()

From 1794e08af743fcfddf88fcee07b883b48728c515 Mon Sep 17 00:00:00 2001
From: "jiangnana.jnn" <jiangnana.jnn@alibaba-inc.com>
Date: Wed, 21 Sep 2022 17:47:50 +0800
Subject: [PATCH 124/175] fix dist training         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10185634

    * fix dist training
---
 modelscope/trainers/trainer.py         | 30 +++++++++++++++--------
 modelscope/trainers/utils/inference.py |  9 ++++---
 modelscope/utils/torch_utils.py        |  4 +++
 tests/trainers/test_trainer_gpu.py     | 34 ++++++++++++++++++++++++--
 4 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index 69645d07..d3675720 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -37,8 +37,8 @@ from modelscope.utils.device import create_device, verify_device
 from modelscope.utils.file_utils import func_receive_dict_inputs
 from modelscope.utils.logger import get_logger
 from modelscope.utils.registry import build_from_cfg
-from modelscope.utils.torch_utils import (get_dist_info, init_dist,
-                                          set_random_seed)
+from modelscope.utils.torch_utils import (get_dist_info, get_local_rank,
+                                          init_dist, set_random_seed)
 from .base import BaseTrainer
 from .builder import TRAINERS
 from .default_config import DEFAULT_CONFIG
@@ -155,8 +155,17 @@ class EpochBasedTrainer(BaseTrainer):
         if self.eval_preprocessor is not None:
             self.eval_preprocessor.mode = ModeKeys.EVAL
 
+        if kwargs.get('launcher', None) is not None:
+            init_dist(kwargs['launcher'])
+
+        _, world_size = get_dist_info()
+        self._dist = world_size > 1
+
         device_name = kwargs.get('device', 'gpu')
-        verify_device(device_name)
+        if self._dist:
+            local_rank = get_local_rank()
+            device_name = f'cuda:{local_rank}'
+
         self.device = create_device(device_name)
 
         self.train_dataset = self.to_task_dataset(
@@ -219,11 +228,6 @@ class EpochBasedTrainer(BaseTrainer):
 
         self.use_fp16 = kwargs.get('use_fp16', False)
 
-        if kwargs.get('launcher', None) is not None:
-            init_dist(kwargs['launcher'])
-
-        self._dist = get_dist_info()[1] > 1
-
         # model placement
         if self.device.type == 'cuda':
             self.model.to(self.device)
@@ -531,8 +535,14 @@ class EpochBasedTrainer(BaseTrainer):
         model.train()
         self._mode = ModeKeys.TRAIN
         # call model forward but not __call__ to skip postprocess
-        if isinstance(inputs,
-                      Mapping) and not func_receive_dict_inputs(model.forward):
+
+        if is_parallel(model):
+            receive_dict_inputs = func_receive_dict_inputs(
+                model.module.forward)
+        else:
+            receive_dict_inputs = func_receive_dict_inputs(model.forward)
+
+        if isinstance(inputs, Mapping) and not receive_dict_inputs:
             train_outputs = model.forward(**inputs)
         else:
             train_outputs = model.forward(inputs)
diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py
index d368c340..7f5d4ec3 100644
--- a/modelscope/trainers/utils/inference.py
+++ b/modelscope/trainers/utils/inference.py
@@ -11,6 +11,7 @@ import torch
 from torch import distributed as dist
 from tqdm import tqdm
 
+from modelscope.trainers.parallel.utils import is_parallel
 from modelscope.utils.data_utils import to_device
 from modelscope.utils.file_utils import func_receive_dict_inputs
 from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master,
@@ -134,7 +135,10 @@ def multi_gpu_test(model,
         data_len = data_loader_iters_per_gpu * world_size
         desc = 'Total test iterations with multi gpus'
 
-    time.sleep(2)  # This line can prevent deadlock problem in some cases.
+    if is_parallel(model):
+        receive_dict_inputs = func_receive_dict_inputs(model.module.forward)
+    else:
+        receive_dict_inputs = func_receive_dict_inputs(model.forward)
 
     count = 0
     with tqdm(total=data_len, desc=desc) as pbar:
@@ -142,8 +146,7 @@ def multi_gpu_test(model,
             data = to_device(data, device)
             data_list.append(data)
             with torch.no_grad():
-                if isinstance(data, Mapping) and not func_receive_dict_inputs(
-                        model.forward):
+                if isinstance(data, Mapping) and not receive_dict_inputs:
                     result = model.forward(**data)
                 else:
                     result = model.forward(data)
diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py
index 6d4132f6..74d9bb7b 100644
--- a/modelscope/utils/torch_utils.py
+++ b/modelscope/utils/torch_utils.py
@@ -115,6 +115,10 @@ def get_dist_info() -> Tuple[int, int]:
     return rank, world_size
 
 
+def get_local_rank():
+    return int(os.environ.get('LOCAL_RANK', 0))
+
+
 def is_master():
     rank, _ = get_dist_info()
     return rank == 0
diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py
index 1f622287..0176704a 100644
--- a/tests/trainers/test_trainer_gpu.py
+++ b/tests/trainers/test_trainer_gpu.py
@@ -53,7 +53,18 @@ class DummyModel(nn.Module, Model):
         return dict(logits=x, loss=loss)
 
 
-def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs):
+class DummyModelForwardInputs(DummyModel):
+
+    def forward(self, inputs):
+        feat, labels = inputs['feat'], inputs['labels']
+        return super().forward(feat, labels)
+
+
+def train_func(work_dir,
+               dist=False,
+               iterable_dataset=False,
+               forward_inputs=False,
+               **kwargs):
     json_cfg = {
         'task': Tasks.image_classification,
         'train': {
@@ -81,7 +92,10 @@ def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs):
     with open(config_path, 'w') as f:
         json.dump(json_cfg, f)
 
-    model = DummyModel()
+    if forward_inputs:
+        model = DummyModelForwardInputs()
+    else:
+        model = DummyModel()
     optimmizer = SGD(model.parameters(), lr=0.01)
     lr_scheduler = StepLR(optimmizer, 2)
     trainer_name = Trainers.default
@@ -273,6 +287,22 @@ class TrainerTestMultiGpus(DistributedTestCase):
         for i in [1, 3, 5]:
             self.assertIn(MetricKeys.ACCURACY, lines[i])
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_multi_gpus_forward_inputs(self):
+        self.start(
+            train_func,
+            num_gpus=2,
+            work_dir=self.tmp_dir,
+            dist=True,
+            forward_inputs=True)
+
+        results_files = os.listdir(self.tmp_dir)
+        json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+
     # TODO: support iters_per_epoch for dist mode
     @unittest.skipIf(True, 'need to adapt to DistributedSampler')
     def test_multi_gpus_with_iters_per_epoch(self):

From b537bb8c270bef36b1ea5d8f0c8c3e2df67aff9d Mon Sep 17 00:00:00 2001
From: "yichang.zyc" <yichang.zyc@alibaba-inc.com>
Date: Wed, 21 Sep 2022 18:57:34 +0800
Subject: [PATCH 125/175] fix vg return value         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10207239

---
 modelscope/models/multi_modal/ofa_for_all_tasks.py |  8 ++++----
 tests/pipelines/test_ofa_tasks.py                  | 10 ++++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py
index 05950378..45bafde9 100644
--- a/modelscope/models/multi_modal/ofa_for_all_tasks.py
+++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py
@@ -112,8 +112,6 @@ class OfaForAllTasks(TorchModel):
                 OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES,
                 OutputKeys.LABELS, OutputKeys.SCORES
         ]:
-            if key in ret and len(ret[key]) == 1:
-                ret[key] = ret[key][0]
             if key not in ret:
                 ret[key] = None
         return ret
@@ -121,8 +119,10 @@ class OfaForAllTasks(TorchModel):
     def postprocess(self, input: Dict[str, Tensor],
                     **kwargs) -> Dict[str, Tensor]:
         if self.cfg.task == Tasks.image_captioning:
-            caption = input[OutputKeys.CAPTION]
-            caption = caption.translate(self.transtab).strip()
+            caption = [
+                cap.translate(self.transtab).strip()
+                for cap in input[OutputKeys.CAPTION]
+            ]
             input[OutputKeys.CAPTION] = caption
         return input
 
diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py
index 9a72d1ff..e6638dfa 100644
--- a/tests/pipelines/test_ofa_tasks.py
+++ b/tests/pipelines/test_ofa_tasks.py
@@ -147,8 +147,10 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         result = ofa_pipe(input)
         print(result)
         image_name = image.split('/')[-2]
-        self.save_img(image, result[OutputKeys.BOXES],
-                      osp.join('large_en_model_' + image_name + '.png'))
+        self.save_img(
+            image,
+            result[OutputKeys.BOXES][0],  # just one box
+            osp.join('large_en_model_' + image_name + '.png'))
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_visual_grounding_with_name(self):
@@ -161,7 +163,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         result = ofa_pipe(input)
         print(result)
         image_name = image.split('/')[-2]
-        self.save_img(image, result[OutputKeys.BOXES],
+        self.save_img(image, result[OutputKeys.BOXES][0],
                       osp.join('large_en_name_' + image_name + '.png'))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -174,7 +176,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         result = ofa_pipe(input)
         print(result)
         image_name = image.split('/')[-1]
-        self.save_img(image, result[OutputKeys.BOXES],
+        self.save_img(image, result[OutputKeys.BOXES][0],
                       osp.join('large_zh_name_' + image_name))
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')

From 4e00a325ad5df4ff128a4b5180b889f015bce9a8 Mon Sep 17 00:00:00 2001
From: "lllcho.lc" <lllcho.lc@alibaba-inc.com>
Date: Wed, 21 Sep 2022 19:42:07 +0800
Subject: [PATCH 126/175] [to #42322933] add license         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10202082

---
 modelscope/pipelines/cv/action_detection_pipeline.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modelscope/pipelines/cv/action_detection_pipeline.py b/modelscope/pipelines/cv/action_detection_pipeline.py
index 72335d5b..74d1862e 100644
--- a/modelscope/pipelines/cv/action_detection_pipeline.py
+++ b/modelscope/pipelines/cv/action_detection_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import math
 import os.path as osp
 from typing import Any, Dict

From 01c8735efa2ca5fa4121aecbd4473a83a2d6daec Mon Sep 17 00:00:00 2001
From: "lingcai.wl" <lingcai.wl@alibaba-inc.com>
Date: Thu, 22 Sep 2022 09:24:30 +0800
Subject: [PATCH 127/175] [to #44657982] update demo utils         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10208690

---
 .../cv/image_style_transfer_pipeline.py       |  8 ++-
 modelscope/utils/demo_utils.py                | 62 ++++++++++++++++---
 2 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/modelscope/pipelines/cv/image_style_transfer_pipeline.py b/modelscope/pipelines/cv/image_style_transfer_pipeline.py
index 827a0d44..64e67115 100644
--- a/modelscope/pipelines/cv/image_style_transfer_pipeline.py
+++ b/modelscope/pipelines/cv/image_style_transfer_pipeline.py
@@ -61,7 +61,13 @@ class ImageStyleTransferPipeline(Pipeline):
     def _sanitize_parameters(self, **pipeline_parameters):
         return pipeline_parameters, {}, {}
 
-    def preprocess(self, content: Input, style: Input) -> Dict[str, Any]:
+    def preprocess(self,
+                   content: Input,
+                   style: Input = None) -> Dict[str, Any]:
+        if type(content) is dict:  # for demo service
+            style = content['style']
+            content = content['content']
+
         content = LoadImage.convert_to_ndarray(content)
         if len(content.shape) == 2:
             content = cv2.cvtColor(content, cv2.COLOR_GRAY2BGR)
diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py
index 41ac0bca..624c7c5a 100644
--- a/modelscope/utils/demo_utils.py
+++ b/modelscope/utils/demo_utils.py
@@ -123,7 +123,7 @@ INPUT_EXAMPLES = {
         'urlPaths': {
             'outUrls': [{
                 'outputKey': OutputKeys.OUTPUT_PCM,
-                'fileType': 'wav'
+                'fileType': 'pcm'
             }]
         }
     },
@@ -134,7 +134,7 @@ INPUT_EXAMPLES = {
         'urlPaths': {
             'outUrls': [{
                 'outputKey': OutputKeys.OUTPUT_PCM,
-                'fileType': 'wav'
+                'fileType': 'pcm'
             }]
         }
     },
@@ -147,7 +147,13 @@ INPUT_EXAMPLES = {
             'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-grounding/visual_grounding.png',
             'a blue turtle-like pokemon with round head'
         ],
-        'urlPaths': {}
+        'urlPaths': {
+            'inUrls': [{
+                'name': 'image'
+            }, {
+                'name': 'text'
+            }]
+        }
     },
     TasksIODescriptions.visual_question_answering: {
         'task':
@@ -156,7 +162,16 @@ INPUT_EXAMPLES = {
             'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/visual_question_answering.png',
             'what is grown on the plant?'
         ],
-        'urlPaths': {}
+        'urlPaths': {
+            'inUrls': [{
+                'name': 'image'
+            }, {
+                'name': 'text'
+            }],
+            'outUrls': [{
+                'outputKey': 'text'
+            }]
+        }
     },
     TasksIODescriptions.visual_entailment: {
         'task':
@@ -165,7 +180,14 @@ INPUT_EXAMPLES = {
             'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-entailment/visual_entailment.jpg',
             'there are two birds.', 'test'
         ],
-        'urlPaths': {}
+        'urlPaths': {
+            'inUrls': [{
+                'name': 'image'
+            }, {
+                'name': 'text'
+            }],
+            'outUrls': [{}]
+        }
     },
     TasksIODescriptions.generative_multi_modal_embedding: {
         'task':
@@ -174,7 +196,14 @@ INPUT_EXAMPLES = {
             'http://clip-multimodal.oss-cn-beijing.aliyuncs.com/lingchen/demo/dogs.jpg',
             'dogs playing in the grass'
         ],
-        'urlPaths': {}
+        'urlPaths': {
+            'inUrls': [{
+                'name': 'image'
+            }, {
+                'name': 'text'
+            }],
+            'outUrls': [{}]
+        }
     },
 }
 
@@ -192,7 +221,13 @@ class DemoCompatibilityCheck(object):
         print('testing demo: ', self.task, self.model_id)
         test_pipline = pipeline(self.task, self.model_id)
         req = INPUT_EXAMPLES[TASKS_INPUT_TEMPLATES[self.task]]
-        output = test_pipline(preprocess(req))
+        inputs = preprocess(req)
+        params = req.get('parameters', {})
+        # maas inference
+        if params != {}:
+            output = test_pipline(inputs, **params)
+        else:
+            output = test_pipline(inputs)
         json.dumps(output, cls=NumpyEncoder)
         result = postprocess(req, output)
         print(result)
@@ -215,11 +250,21 @@ class NumpyEncoder(json.JSONEncoder):
 
 
 def preprocess(req):
+    in_urls = req.get('urlPaths').get('inUrls')
     if len(req['inputs']) == 1:
         inputs = req['inputs'][0]
     else:
         inputs = tuple(req['inputs'])
-    return inputs
+    if in_urls is None or len(in_urls) == 0:
+        return inputs
+
+    inputs_dict = {}
+    for i, in_url in enumerate(in_urls):
+        input_name = in_url.get('name')
+        if input_name is None or input_name == '':
+            return inputs
+        inputs_dict[input_name] = req['inputs'][i]
+    return inputs_dict
 
 
 def postprocess(req, resp):
@@ -242,4 +287,3 @@ def postprocess(req, resp):
             out_mem_file = io.BytesIO()
             out_mem_file.write(new_resp.get(output_key))
             return type(out_mem_file)
-        # TODO(lingcai.wl): support more file type

From 4bc188173ef1311e56ee39e438cb78d926738ef7 Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingda.chen@alibaba-inc.com>
Date: Thu, 22 Sep 2022 09:29:41 +0800
Subject: [PATCH 128/175] [to #42322933] fix typo

---
 modelscope/utils/demo_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py
index 624c7c5a..363ae950 100644
--- a/modelscope/utils/demo_utils.py
+++ b/modelscope/utils/demo_utils.py
@@ -223,7 +223,7 @@ class DemoCompatibilityCheck(object):
         req = INPUT_EXAMPLES[TASKS_INPUT_TEMPLATES[self.task]]
         inputs = preprocess(req)
         params = req.get('parameters', {})
-        # maas inference
+        # modelscope inference
         if params != {}:
             output = test_pipline(inputs, **params)
         else:

From 376ba9fef9e9661286c938aa679366f1d83c77a9 Mon Sep 17 00:00:00 2001
From: "jiangnana.jnn" <jiangnana.jnn@alibaba-inc.com>
Date: Thu, 22 Sep 2022 15:15:29 +0800
Subject: [PATCH 129/175] [to #42322933]update easycv pipelines         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10208603

    * update easycv pipelines
---
 .../pipelines/cv/easycv_pipelines/base.py     | 11 +++--
 .../face_2d_keypoints_pipeline.py             | 12 +++---
 .../cv/hand_2d_keypoints_pipeline.py          |  5 ++-
 requirements/cv.txt                           |  2 +-
 .../test_segmentation_pipeline.py             | 43 ++++++++++++++-----
 tests/pipelines/test_face_2d_keypoints.py     |  2 +-
 tests/run_config.yaml                         |  1 +
 7 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py
index d6495f0a..8aea1146 100644
--- a/modelscope/pipelines/cv/easycv_pipelines/base.py
+++ b/modelscope/pipelines/cv/easycv_pipelines/base.py
@@ -10,6 +10,7 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.pipelines.util import is_official_hub_path
 from modelscope.utils.config import Config
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
+from modelscope.utils.device import create_device
 
 
 class EasyCVPipeline(object):
@@ -53,16 +54,19 @@ class EasyCVPipeline(object):
         ), f'Not find "{ModelFile.CONFIGURATION}" in model directory!'
 
         self.cfg = Config.from_file(self.config_file)
-        self.predict_op = self._build_predict_op()
+        if 'device' in kwargs:
+            kwargs['device'] = create_device(kwargs['device'])
+        self.predict_op = self._build_predict_op(**kwargs)
 
-    def _build_predict_op(self):
+    def _build_predict_op(self, **kwargs):
         """Build EasyCV predictor."""
         from easycv.predictors.builder import build_predictor
 
         easycv_config = self._to_easycv_config()
         pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, {
             'model_path': self.model_path,
-            'config_file': easycv_config
+            'config_file': easycv_config,
+            **kwargs
         })
         return pipeline_op
 
@@ -91,5 +95,4 @@ class EasyCVPipeline(object):
         return easycv_config
 
     def __call__(self, inputs) -> Any:
-        # TODO: support image url
         return self.predict_op(inputs)
diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
index eb4d6c15..7c32e0fc 100644
--- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
+++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
@@ -4,7 +4,6 @@ from typing import Any
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import ModelFile, Tasks
 from .base import EasyCVPipeline
 
@@ -34,8 +33,11 @@ class Face2DKeypointsPipeline(EasyCVPipeline):
         return self.predict_op.show_result(img, points, scale, save_path)
 
     def __call__(self, inputs) -> Any:
-        output = self.predict_op(inputs)[0][0]
-        points = output['point']
-        poses = output['pose']
+        outputs = self.predict_op(inputs)
 
-        return {OutputKeys.KEYPOINTS: points, OutputKeys.POSES: poses}
+        results = [{
+            OutputKeys.KEYPOINTS: output['point'],
+            OutputKeys.POSES: output['pose']
+        } for output in outputs]
+
+        return results
diff --git a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
index db66f5d2..bad0c652 100644
--- a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
+++ b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
@@ -28,7 +28,7 @@ class Hand2DKeypointsPipeline(EasyCVPipeline):
             *args,
             **kwargs)
 
-    def _build_predict_op(self):
+    def _build_predict_op(self, **kwargs):
         """Build EasyCV predictor."""
         from easycv.predictors.builder import build_predictor
         detection_predictor_type = self.cfg['DETECTION']['type']
@@ -46,6 +46,7 @@ class Hand2DKeypointsPipeline(EasyCVPipeline):
         easycv_config = self._to_easycv_config()
         pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, {
             'model_path': self.model_path,
-            'config_file': easycv_config
+            'config_file': easycv_config,
+            **kwargs
         })
         return pipeline_op
diff --git a/requirements/cv.txt b/requirements/cv.txt
index ebb61851..8c06242a 100644
--- a/requirements/cv.txt
+++ b/requirements/cv.txt
@@ -14,7 +14,7 @@ mmcls>=0.21.0
 mmdet>=2.25.0
 networkx>=2.5
 onnxruntime>=1.10
-pai-easycv>=0.6.0
+pai-easycv>=0.6.3.4
 pandas
 psutil
 regex
diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
index 6cfdacc6..db9c403a 100644
--- a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
+++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
@@ -1,10 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import unittest
+from distutils.version import LooseVersion
 
+import easycv
 import numpy as np
 from PIL import Image
 
-from modelscope.metainfo import Pipelines
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level
@@ -24,38 +25,60 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase):
 
         results = outputs[0]
         self.assertListEqual(
-            list(img.shape)[:2], list(results['seg_pred'][0].shape))
-        self.assertListEqual(results['seg_pred'][0][1, 4:10].tolist(),
+            list(img.shape)[:2], list(results['seg_pred'].shape))
+        self.assertListEqual(results['seg_pred'][1, 4:10].tolist(),
                              [161 for i in range(6)])
-        self.assertListEqual(results['seg_pred'][0][-1, -10:].tolist(),
+        self.assertListEqual(results['seg_pred'][-1, -10:].tolist(),
                              [133 for i in range(10)])
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def _internal_test_batch(self, model_id, num_samples=2, batch_size=2):
+        # TODO: support in the future
+        img = np.asarray(Image.open(self.img_path))
+        num_samples = num_samples
+        batch_size = batch_size
+        semantic_seg = pipeline(
+            task=Tasks.image_segmentation,
+            model=model_id,
+            batch_size=batch_size)
+        outputs = semantic_seg([self.img_path] * num_samples)
+
+        self.assertEqual(semantic_seg.predict_op.batch_size, batch_size)
+        self.assertEqual(len(outputs), num_samples)
+
+        for output in outputs:
+            self.assertListEqual(
+                list(img.shape)[:2], list(output['seg_pred'].shape))
+            self.assertListEqual(output['seg_pred'][1, 4:10].tolist(),
+                                 [161 for i in range(6)])
+            self.assertListEqual(output['seg_pred'][-1, -10:].tolist(),
+                                 [133 for i in range(10)])
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b0(self):
         model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k'
         self._internal_test__(model_id)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b1(self):
         model_id = 'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k'
         self._internal_test__(model_id)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b2(self):
         model_id = 'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k'
         self._internal_test__(model_id)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b3(self):
         model_id = 'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k'
         self._internal_test__(model_id)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b4(self):
         model_id = 'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k'
         self._internal_test__(model_id)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b5(self):
         model_id = 'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k'
         self._internal_test__(model_id)
diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py
index a5e347e8..667ecddc 100644
--- a/tests/pipelines/test_face_2d_keypoints.py
+++ b/tests/pipelines/test_face_2d_keypoints.py
@@ -18,7 +18,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase):
 
         face_2d_keypoints_align = pipeline(
             task=Tasks.face_2d_keypoints, model=model_id)
-        output = face_2d_keypoints_align(img_path)
+        output = face_2d_keypoints_align(img_path)[0]
 
         output_keypoints = output[OutputKeys.KEYPOINTS]
         output_pose = output[OutputKeys.POSES]
diff --git a/tests/run_config.yaml b/tests/run_config.yaml
index fc983023..4c571b7f 100644
--- a/tests/run_config.yaml
+++ b/tests/run_config.yaml
@@ -9,6 +9,7 @@ isolated:  # test cases that may require excessive anmount of GPU memory, which
   - test_image_super_resolution.py
   - test_easycv_trainer.py
   - test_segformer.py
+  - test_segmentation_pipeline.py
 
 envs:
   default: # default env, case not in other env will in default, pytorch.

From 1c66f2a9d7723e3f90cc12f7dffc7402d0d7d87e Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Thu, 22 Sep 2022 18:08:52 +0800
Subject: [PATCH 130/175]  [to #44902165] bump version to 0.4.4

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index 908c0bb7..9a8e054a 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.4.3'
+__version__ = '0.4.4'

From dc2cf3c2dc397c95b5b7db7b238feee327c5874c Mon Sep 17 00:00:00 2001
From: "hejunjie.hjj" <hejunjie.hjj@alibaba-inc.com>
Date: Thu, 22 Sep 2022 22:54:00 +0800
Subject: [PATCH 131/175] [to #42322933] add license header         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10231619

---
 modelscope/metrics/image_instance_segmentation_metric.py | 2 ++
 .../backbones/swin_transformer.py                        | 4 ++--
 .../cascade_mask_rcnn_swin.py                            | 2 ++
 .../cv/image_instance_segmentation/datasets/__init__.py  | 1 +
 .../image_instance_segmentation/datasets/transforms.py   | 9 +++++----
 .../models/cv/image_instance_segmentation/model.py       | 1 +
 .../cv/image_instance_segmentation/postprocess_utils.py  | 2 ++
 .../image_instance_segmentation_coco_dataset.py          | 2 ++
 .../pipelines/cv/image_instance_segmentation_pipeline.py | 1 +
 .../trainers/cv/image_instance_segmentation_trainer.py   | 1 +
 10 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/modelscope/metrics/image_instance_segmentation_metric.py b/modelscope/metrics/image_instance_segmentation_metric.py
index 7deafbce..86a19d13 100644
--- a/modelscope/metrics/image_instance_segmentation_metric.py
+++ b/modelscope/metrics/image_instance_segmentation_metric.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from MMDetection, publicly available at
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py
 import os.path as osp
 import tempfile
 from collections import OrderedDict
diff --git a/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py b/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py
index 3e7609e1..2007688d 100644
--- a/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py
+++ b/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py
@@ -1,5 +1,5 @@
-# Modified from: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
-
+# The implementation is adopted from Swin Transformer, made publicly available under the MIT License at
+# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
 import numpy as np
 import torch
 import torch.nn as nn
diff --git a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py
index 30e70f82..ff83271e 100644
--- a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py
+++ b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from MMDetection, publicly available at
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
 import os
 from collections import OrderedDict
 
diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py
index cca1432f..1b096fb3 100644
--- a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py
+++ b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py
@@ -1 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .transforms import build_preprocess_transform
diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py b/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py
index c2c11286..f0dde759 100644
--- a/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py
+++ b/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 
 import numpy as np
@@ -51,9 +52,9 @@ class LoadImageFromFile:
     """Load an image from file.
 
     Required keys are "img_prefix" and "img_info" (a dict that must contain the
-    key "filename"). Added or updated keys are "filename", "img", "img_shape",
-    "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`),
-    "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1).
+    key "filename", "ann_file", and "classes"). Added or updated keys are
+    "filename", "ori_filename", "img", "img_shape", "ori_shape" (same as `img_shape`),
+    "img_fields", "ann_file" (path to annotation file) and "classes".
 
     Args:
         to_float32 (bool): Whether to convert the loaded image to a float32
@@ -73,7 +74,7 @@ class LoadImageFromFile:
         """Call functions to load image and get image meta information.
 
         Args:
-            results (dict): Result dict from :obj:`ImageInstanceSegmentationDataset`.
+            results (dict): Result dict from :obj:`ImageInstanceSegmentationCocoDataset`.
 
         Returns:
             dict: The dict contains loaded image and meta information.
diff --git a/modelscope/models/cv/image_instance_segmentation/model.py b/modelscope/models/cv/image_instance_segmentation/model.py
index 2be59623..a56a1608 100644
--- a/modelscope/models/cv/image_instance_segmentation/model.py
+++ b/modelscope/models/cv/image_instance_segmentation/model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Any, Dict
 
diff --git a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py
index 531e2efd..6058cd73 100644
--- a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py
+++ b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from MMDetection, publicly available at
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/visualization/image.py
 import itertools
 
 import cv2
diff --git a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
index 10cf7bfb..1c7bc249 100644
--- a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
+++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from MMDetection, publicly available at
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py
 import os.path as osp
 
 import numpy as np
diff --git a/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py b/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py
index ce0bf907..5a0f0d7e 100644
--- a/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Any, Dict, Optional, Union
 
diff --git a/modelscope/trainers/cv/image_instance_segmentation_trainer.py b/modelscope/trainers/cv/image_instance_segmentation_trainer.py
index 2e2415dc..a777bde1 100644
--- a/modelscope/trainers/cv/image_instance_segmentation_trainer.py
+++ b/modelscope/trainers/cv/image_instance_segmentation_trainer.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from modelscope.metainfo import Trainers
 from modelscope.trainers.builder import TRAINERS
 from modelscope.trainers.trainer import EpochBasedTrainer

From f4044f14fd23c6edb8e8051426c99381bd53d24a Mon Sep 17 00:00:00 2001
From: "wendi.hwd" <wendi.hwd@alibaba-inc.com>
Date: Thu, 22 Sep 2022 22:54:59 +0800
Subject: [PATCH 132/175] cv/cvdet_lic_add file licence         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10214336

---
 modelscope/models/cv/object_detection/mmdet_model.py          | 1 +
 modelscope/models/cv/object_detection/mmdet_ms/__init__.py    | 2 ++
 .../models/cv/object_detection/mmdet_ms/backbones/__init__.py | 2 ++
 .../cv/object_detection/mmdet_ms/dense_heads/__init__.py      | 2 ++
 .../cv/object_detection/mmdet_ms/dense_heads/anchor_head.py   | 3 ++-
 .../cv/object_detection/mmdet_ms/dense_heads/rpn_head.py      | 3 ++-
 .../models/cv/object_detection/mmdet_ms/necks/__init__.py     | 2 ++
 modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py   | 3 ++-
 .../models/cv/object_detection/mmdet_ms/roi_heads/__init__.py | 2 ++
 .../mmdet_ms/roi_heads/bbox_heads/__init__.py                 | 2 ++
 .../mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py         | 3 ++-
 .../mmdet_ms/roi_heads/mask_heads/__init__.py                 | 2 ++
 .../mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py            | 3 ++-
 .../models/cv/object_detection/mmdet_ms/utils/__init__.py     | 2 ++
 .../models/cv/object_detection/mmdet_ms/utils/checkpoint.py   | 3 ++-
 .../cv/object_detection/mmdet_ms/utils/convModule_norm.py     | 4 ++--
 modelscope/models/cv/salient_detection/models/__init__.py     | 2 ++
 modelscope/models/cv/salient_detection/models/u2net.py        | 3 ++-
 modelscope/models/cv/salient_detection/salient_model.py       | 1 +
 19 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/modelscope/models/cv/object_detection/mmdet_model.py b/modelscope/models/cv/object_detection/mmdet_model.py
index 7bf81349..485d440a 100644
--- a/modelscope/models/cv/object_detection/mmdet_model.py
+++ b/modelscope/models/cv/object_detection/mmdet_model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 
 import numpy as np
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/__init__.py
index 2e47ce76..3a1fdd0b 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from .backbones import ViT
 from .dense_heads import AnchorNHead, RPNNHead
 from .necks import FPNF
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py
index 3b34dad6..c0697d48 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from .vit import ViT
 
 __all__ = ['ViT']
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py
index 0fba8c00..0d34e996 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from .anchor_head import AnchorNHead
 from .rpn_head import RPNNHead
 
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py
index b4114652..d4ea5282 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from mmdet.models.builder import HEADS
 from mmdet.models.dense_heads import AnchorHead
 
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py
index f53368ce..8e934a5c 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 import copy
 
 import torch
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py
index 5b0b6210..d164987e 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from .fpn import FPNF
 
 __all__ = ['FPNF']
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py b/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py
index 52529b28..5f8648ce 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 import torch.nn as nn
 import torch.nn.functional as F
 from mmcv.runner import BaseModule, auto_fp16
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py
index a6be3775..658280df 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from .bbox_heads import (ConvFCBBoxNHead, Shared2FCBBoxNHead,
                          Shared4Conv1FCBBoxNHead)
 from .mask_heads import FCNMaskNHead
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py
index 0d4d5b6b..61d93503 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from .convfc_bbox_head import (ConvFCBBoxNHead, Shared2FCBBoxNHead,
                                Shared4Conv1FCBBoxNHead)
 
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py
index d2e04b80..726329a1 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 import torch.nn as nn
 from mmdet.models.builder import HEADS
 from mmdet.models.roi_heads.bbox_heads.bbox_head import BBoxHead
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py
index 8f816850..043e62a0 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from .fcn_mask_head import FCNMaskNHead
 
 __all__ = ['FCNMaskNHead']
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py
index e5aedc98..335f6b8f 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from warnings import warn
 
 import numpy as np
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py
index 971a0232..34f240c6 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from .checkpoint import load_checkpoint
 from .convModule_norm import ConvModule_Norm
 
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py b/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py
index 593af1cc..7833f592 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py
@@ -1,5 +1,6 @@
 # Copyright (c) Open-MMLab. All rights reserved.
-# Implementation adopted from ViTAE-Transformer, source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 import io
 import os
 import os.path as osp
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py b/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py
index d81c24e1..a15780f7 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py
@@ -1,5 +1,5 @@
-# Implementation adopted from ViTAE-Transformer, source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
-
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
 from mmcv.cnn import ConvModule
 
 
diff --git a/modelscope/models/cv/salient_detection/models/__init__.py b/modelscope/models/cv/salient_detection/models/__init__.py
index 0850c33d..8ea7a5d3 100644
--- a/modelscope/models/cv/salient_detection/models/__init__.py
+++ b/modelscope/models/cv/salient_detection/models/__init__.py
@@ -1 +1,3 @@
+# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License
+# source code avaiable via https://github.com/xuebinqin/U-2-Net
 from .u2net import U2NET
diff --git a/modelscope/models/cv/salient_detection/models/u2net.py b/modelscope/models/cv/salient_detection/models/u2net.py
index 0a0a4511..05dbf7ad 100644
--- a/modelscope/models/cv/salient_detection/models/u2net.py
+++ b/modelscope/models/cv/salient_detection/models/u2net.py
@@ -1,4 +1,5 @@
-# Implementation in this file is modifed from source code avaiable via https://github.com/xuebinqin/U-2-Net
+# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License
+# source code avaiable via https://github.com/xuebinqin/U-2-Net
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/cv/salient_detection/salient_model.py b/modelscope/models/cv/salient_detection/salient_model.py
index 539d1f24..6e617f58 100644
--- a/modelscope/models/cv/salient_detection/salient_model.py
+++ b/modelscope/models/cv/salient_detection/salient_model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 
 import cv2

From 470a1989bcaf6aa4c7050926f33d1d431b6e9233 Mon Sep 17 00:00:00 2001
From: "bin.xue" <bin.xue@alibaba-inc.com>
Date: Thu, 22 Sep 2022 23:01:14 +0800
Subject: [PATCH 133/175] [to #42322933] feat: far field KWS accept mono audio
 for online demo         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10211100

---
 data/test/audios/1ch_nihaomiya.wav            |  3 ++
 .../pipelines/audio/kws_farfield_pipeline.py  | 41 ++++++++++---------
 .../test_key_word_spotting_farfield.py        | 11 +++++
 3 files changed, 36 insertions(+), 19 deletions(-)
 create mode 100644 data/test/audios/1ch_nihaomiya.wav

diff --git a/data/test/audios/1ch_nihaomiya.wav b/data/test/audios/1ch_nihaomiya.wav
new file mode 100644
index 00000000..4618d412
--- /dev/null
+++ b/data/test/audios/1ch_nihaomiya.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f7f5a0a4efca1e83463cb44460c66b56fb7cd673eb6da37924637bc05ef758d
+size 1440044
diff --git a/modelscope/pipelines/audio/kws_farfield_pipeline.py b/modelscope/pipelines/audio/kws_farfield_pipeline.py
index 62f58fee..e2f618fa 100644
--- a/modelscope/pipelines/audio/kws_farfield_pipeline.py
+++ b/modelscope/pipelines/audio/kws_farfield_pipeline.py
@@ -4,6 +4,9 @@ import io
 import wave
 from typing import Any, Dict
 
+import numpy
+import soundfile as sf
+
 from modelscope.fileio import File
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
@@ -37,7 +40,6 @@ class KWSFarfieldPipeline(Pipeline):
         self.model.eval()
         frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH
         self._nframe = self.model.size_in // frame_size
-        self.frame_count = 0
 
     def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
         if isinstance(inputs, bytes):
@@ -54,35 +56,36 @@ class KWSFarfieldPipeline(Pipeline):
         input_file = inputs['input_file']
         if isinstance(input_file, str):
             input_file = File.read(input_file)
-        if isinstance(input_file, bytes):
-            input_file = io.BytesIO(input_file)
-        self.frame_count = 0
+        frames, samplerate = sf.read(io.BytesIO(input_file), dtype='int16')
+        if len(frames.shape) == 1:
+            frames = numpy.stack((frames, frames, numpy.zeros_like(frames)), 1)
+
         kws_list = []
-        with wave.open(input_file, 'rb') as fin:
-            if 'output_file' in inputs:
-                with wave.open(inputs['output_file'], 'wb') as fout:
-                    fout.setframerate(self.SAMPLE_RATE)
-                    fout.setnchannels(self.OUTPUT_CHANNELS)
-                    fout.setsampwidth(self.SAMPLE_WIDTH)
-                    self._process(fin, kws_list, fout)
-            else:
-                self._process(fin, kws_list)
+        if 'output_file' in inputs:
+            with wave.open(inputs['output_file'], 'wb') as fout:
+                fout.setframerate(self.SAMPLE_RATE)
+                fout.setnchannels(self.OUTPUT_CHANNELS)
+                fout.setsampwidth(self.SAMPLE_WIDTH)
+                self._process(frames, kws_list, fout)
+        else:
+            self._process(frames, kws_list)
         return {OutputKeys.KWS_LIST: kws_list}
 
     def _process(self,
-                 fin: wave.Wave_read,
+                 frames: numpy.ndarray,
                  kws_list,
                  fout: wave.Wave_write = None):
-        data = fin.readframes(self._nframe)
-        while len(data) >= self.model.size_in:
-            self.frame_count += self._nframe
+        for start_index in range(0, frames.shape[0], self._nframe):
+            end_index = start_index + self._nframe
+            if end_index > frames.shape[0]:
+                end_index = frames.shape[0]
+            data = frames[start_index:end_index, :].tobytes()
             result = self.model.forward_decode(data)
             if fout:
                 fout.writeframes(result['pcm'])
             if 'kws' in result:
-                result['kws']['offset'] += self.frame_count / self.SAMPLE_RATE
+                result['kws']['offset'] += start_index / self.SAMPLE_RATE
                 kws_list.append(result['kws'])
-            data = fin.readframes(self._nframe)
 
     def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
         return inputs
diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py
index fea7afd7..f8c167de 100644
--- a/tests/pipelines/test_key_word_spotting_farfield.py
+++ b/tests/pipelines/test_key_word_spotting_farfield.py
@@ -8,6 +8,7 @@ from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level
 
 TEST_SPEECH_FILE = 'data/test/audios/3ch_nihaomiya.wav'
+TEST_SPEECH_FILE_MONO = 'data/test/audios/1ch_nihaomiya.wav'
 TEST_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \
                   'speech_dfsmn_kws_char_farfield_16k_nihaomiya/repo' \
                   '?Revision=master&FilePath=examples/3ch_nihaomiya.wav'
@@ -26,6 +27,16 @@ class KWSFarfieldTest(unittest.TestCase):
         self.assertEqual(len(result['kws_list']), 5)
         print(result['kws_list'][-1])
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_mono(self):
+        kws = pipeline(Tasks.keyword_spotting, model=self.model_id)
+        inputs = {
+            'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO)
+        }
+        result = kws(inputs)
+        self.assertEqual(len(result['kws_list']), 5)
+        print(result['kws_list'][-1])
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_url(self):
         kws = pipeline(Tasks.keyword_spotting, model=self.model_id)

From a22a4e9a3aee858c0a34fb037c8d26e83d9a1a15 Mon Sep 17 00:00:00 2001
From: "shuying.shu" <shuying.shu@alibaba-inc.com>
Date: Fri, 23 Sep 2022 09:38:55 +0800
Subject: [PATCH 134/175] [to #42322933]add license header         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10222223

---
 modelscope/metrics/movie_scene_segmentation_metric.py     | 2 ++
 modelscope/models/cv/movie_scene_segmentation/model.py    | 3 +++
 .../models/cv/movie_scene_segmentation/utils/__init__.py  | 1 +
 .../models/cv/movie_scene_segmentation/utils/head.py      | 8 ++------
 .../models/cv/movie_scene_segmentation/utils/save_op.py   | 6 ++----
 .../cv/movie_scene_segmentation/utils/shot_encoder.py     | 4 +---
 .../task_datasets/movie_scene_segmentation/__init__.py    | 1 +
 .../movie_scene_segmentation_dataset.py                   | 5 ++---
 .../pipelines/cv/movie_scene_segmentation_pipeline.py     | 1 +
 .../preprocessors/movie_scene_segmentation/__init__.py    | 1 +
 .../preprocessors/movie_scene_segmentation/transforms.py  | 8 ++------
 .../trainers/cv/movie_scene_segmentation_trainer.py       | 1 +
 12 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/modelscope/metrics/movie_scene_segmentation_metric.py b/modelscope/metrics/movie_scene_segmentation_metric.py
index 56bdbd1c..65725b6f 100644
--- a/modelscope/metrics/movie_scene_segmentation_metric.py
+++ b/modelscope/metrics/movie_scene_segmentation_metric.py
@@ -1,3 +1,5 @@
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl
 from typing import Dict
 
 import numpy as np
diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py
index e9576963..676b5ac1 100644
--- a/modelscope/models/cv/movie_scene_segmentation/model.py
+++ b/modelscope/models/cv/movie_scene_segmentation/model.py
@@ -1,3 +1,6 @@
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl
+
 import os
 import os.path as osp
 from typing import Any, Dict
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
index 3682726f..e5a929aa 100644
--- a/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .save_op import get_pred_boundary, pred2scene, scene2video
 from .shot_encoder import resnet50
 from .trn import TransformerCRN
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/head.py b/modelscope/models/cv/movie_scene_segmentation/utils/head.py
index 20a87e66..d6468c53 100644
--- a/modelscope/models/cv/movie_scene_segmentation/utils/head.py
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/head.py
@@ -1,9 +1,5 @@
-# ------------------------------------------------------------------------------------
-# BaSSL
-# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
-# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
-# Github: https://github.com/kakaobrain/bassl
-# ------------------------------------------------------------------------------------
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl
 
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
index d7c8c0ed..cf26d21a 100644
--- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
@@ -1,7 +1,5 @@
-# ----------------------------------------------------------------------------------
-# The codes below partially refer to the SceneSeg LGSS.
-# Github: https://github.com/AnyiRao/SceneSeg
-# ----------------------------------------------------------------------------------
+# The implementation here is modified based on SceneSeg,
+# originally Apache 2.0 License and publicly avaialbe at https://github.com/AnyiRao/SceneSeg
 import os
 import os.path as osp
 import subprocess
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
index 7ad1907f..11d20b13 100644
--- a/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
@@ -1,6 +1,4 @@
-"""
-Modified from original implementation in torchvision
-"""
+# The implementation is adopted from torchvision
 
 from typing import Any, Callable, List, Optional, Type, Union
 
diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py
index e56039ac..b1bc40f8 100644
--- a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py
+++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py
@@ -1 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .movie_scene_segmentation_dataset import MovieSceneSegmentationDataset
diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py
index 925d6281..68cbf918 100644
--- a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py
+++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py
@@ -1,6 +1,5 @@
-# ---------------------------------------------------------------------------------------------------
-# The implementation is built upon BaSSL, publicly available at https://github.com/kakaobrain/bassl
-# ---------------------------------------------------------------------------------------------------
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl
 import copy
 import os
 import os.path as osp
diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
index 0ef0261d..b5acf17a 100644
--- a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Any, Dict
 
 import torch
diff --git a/modelscope/preprocessors/movie_scene_segmentation/__init__.py b/modelscope/preprocessors/movie_scene_segmentation/__init__.py
index 73da792d..b28ccabc 100644
--- a/modelscope/preprocessors/movie_scene_segmentation/__init__.py
+++ b/modelscope/preprocessors/movie_scene_segmentation/__init__.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
diff --git a/modelscope/preprocessors/movie_scene_segmentation/transforms.py b/modelscope/preprocessors/movie_scene_segmentation/transforms.py
index b4e57420..5b84003c 100644
--- a/modelscope/preprocessors/movie_scene_segmentation/transforms.py
+++ b/modelscope/preprocessors/movie_scene_segmentation/transforms.py
@@ -1,9 +1,5 @@
-# ------------------------------------------------------------------------------------
-# The codes below partially refer to the BaSSL
-# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
-# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
-# Github: https://github.com/kakaobrain/bassl
-# ------------------------------------------------------------------------------------
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl
 import numbers
 import os.path as osp
 import random
diff --git a/modelscope/trainers/cv/movie_scene_segmentation_trainer.py b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py
index ee4dd849..7645f9f3 100644
--- a/modelscope/trainers/cv/movie_scene_segmentation_trainer.py
+++ b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from modelscope.metainfo import Trainers
 from modelscope.trainers.builder import TRAINERS
 from modelscope.trainers.trainer import EpochBasedTrainer

From b4cff7cc3158212e1c9ff74e3e73cac3eece3d79 Mon Sep 17 00:00:00 2001
From: "feiwu.yfw" <feiwu.yfw@alibaba-inc.com>
Date: Fri, 23 Sep 2022 15:27:05 +0800
Subject: [PATCH 135/175] =?UTF-8?q?[to=20#44842128]=20=E4=BF=AE=E5=A4=8DMs?=
 =?UTF-8?q?Dataset=20torch=E5=9C=BA=E6=99=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1、to_torch_dataset时支持保留原数据类型
2、替换torch.utils.data.IterableDataset为torch.utils.data.Dataset，支持分布式训练和shuffle。后续等streaming数据加载方式支持后再引入IterableDataset
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10214102
---
 modelscope/msdatasets/ms_dataset.py | 104 ++++++++++++++--------------
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 0fb877b7..361b8ae0 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -44,44 +44,40 @@ def format_list(para) -> List:
     return para
 
 
-class MsIterableDataset(torch.utils.data.IterableDataset):
+class MsMapDataset(torch.utils.data.Dataset):
 
     def __init__(self, dataset: Iterable, preprocessor_list, retained_columns,
-                 columns):
-        super(MsIterableDataset).__init__()
+                 columns, to_tensor):
+        super(MsDataset).__init__()
         self.dataset = dataset
         self.preprocessor_list = preprocessor_list
+        self.to_tensor = to_tensor
         self.retained_columns = retained_columns
         self.columns = columns
 
     def __len__(self):
         return len(self.dataset)
 
-    def __iter__(self):
-        worker_info = torch.utils.data.get_worker_info()
-        if worker_info is None:  # single-process data loading
-            iter_start = 0
-            iter_end = len(self.dataset)
-        else:  # in a worker process
-            per_worker = math.ceil(
-                len(self.dataset) / float(worker_info.num_workers))
-            worker_id = worker_info.id
-            iter_start = worker_id * per_worker
-            iter_end = min(iter_start + per_worker, len(self.dataset))
+    def type_converter(self, x):
+        if self.to_tensor:
+            return torch.tensor(x)
+        else:
+            return x
 
-        for idx in range(iter_start, iter_end):
-            item_dict = self.dataset[idx]
-            res = {
-                k: torch.tensor(item_dict[k])
-                for k in self.columns if k in self.retained_columns
-            }
-            for preprocessor in self.preprocessor_list:
-                res.update({
-                    k: torch.tensor(v)
-                    for k, v in preprocessor(item_dict).items()
-                    if k in self.retained_columns
-                })
-            yield res
+    def __getitem__(self, index):
+        item_dict = self.dataset[index]
+        res = {
+            k: self.type_converter(item_dict[k])
+            for k in self.columns
+            if (not self.to_tensor) or k in self.retained_columns
+        }
+        for preprocessor in self.preprocessor_list:
+            res.update({
+                k: self.type_converter(v)
+                for k, v in preprocessor(item_dict).items()
+                if (not self.to_tensor) or k in self.retained_columns
+            })
+        return res
 
 
 class MsDataset:
@@ -341,6 +337,7 @@ class MsDataset:
         self,
         preprocessors: Union[Callable, List[Callable]],
         columns: Union[str, List[str]] = None,
+        to_tensor: bool = True,
     ):
         preprocessor_list = preprocessors if isinstance(
             preprocessors, list) else [preprocessors]
@@ -350,28 +347,29 @@ class MsDataset:
         columns = [
             key for key in self._hf_ds.features.keys() if key in columns
         ]
-        sample = next(iter(self._hf_ds))
-
-        sample_res = {k: np.array(sample[k]) for k in columns}
-        for processor in preprocessor_list:
-            sample_res.update(
-                {k: np.array(v)
-                 for k, v in processor(sample).items()})
-
-        def is_numpy_number(value):
-            return np.issubdtype(value.dtype, np.integer) or np.issubdtype(
-                value.dtype, np.floating)
-
         retained_columns = []
-        for k in sample_res.keys():
-            if not is_numpy_number(sample_res[k]):
-                logger.warning(
-                    f'Data of column {k} is non-numeric, will be removed')
-                continue
-            retained_columns.append(k)
+        if to_tensor:
+            sample = next(iter(self._hf_ds))
 
-        return MsIterableDataset(self._hf_ds, preprocessor_list,
-                                 retained_columns, columns)
+            sample_res = {k: np.array(sample[k]) for k in columns}
+            for processor in preprocessor_list:
+                sample_res.update(
+                    {k: np.array(v)
+                     for k, v in processor(sample).items()})
+
+            def is_numpy_number(value):
+                return np.issubdtype(value.dtype, np.integer) or np.issubdtype(
+                    value.dtype, np.floating)
+
+            for k in sample_res.keys():
+                if not is_numpy_number(sample_res[k]):
+                    logger.warning(
+                        f'Data of column {k} is non-numeric, will be removed')
+                    continue
+                retained_columns.append(k)
+
+        return MsMapDataset(self._hf_ds, preprocessor_list, retained_columns,
+                            columns, to_tensor)
 
     def to_torch_dataset(
         self,
@@ -379,6 +377,7 @@ class MsDataset:
         preprocessors: Union[Callable, List[Callable]] = None,
         task_name: str = None,
         task_data_config: ConfigDict = None,
+        to_tensor: bool = True,
         **format_kwargs,
     ):
         """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
@@ -386,13 +385,14 @@ class MsDataset:
 
         Args:
             preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
-                every sample of the dataset. The output type of processors is dict, and each numeric field of the dict
+                every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict
                 will be used as a field of torch.utils.data.Dataset.
-            columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
-                preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
-                the output fields of processors will also be added.
+            columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if
+                `to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column.
+                If the `preprocessors` is not None, the output fields of processors will also be added.
             task_name (str, default None):  task name, refer to :obj:`Tasks` for more details
             task_data_config (ConfigDict, default None): config dict for model object.
+            to_tensor (bool, default True): whether to convert the data types of dataset column(s) to torch.tensor or not.
             format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.
 
         Returns:
@@ -409,7 +409,7 @@ class MsDataset:
             return build_task_dataset(task_data_config, task_name)
         if preprocessors is not None:
             return self.to_torch_dataset_with_processors(
-                preprocessors, columns=columns)
+                preprocessors, columns=columns, to_tensor=to_tensor)
         else:
             self._hf_ds.reset_format()
             self._hf_ds.set_format(

From 69f8928dd28653c6a05950af0bbf4c9037b9ce97 Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Fri, 23 Sep 2022 15:54:17 +0800
Subject: [PATCH 136/175] [to #42322933] Replace mplug input 'question' with
 'text'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mplug 相关任务 pipeline 输入字段统一为 'image' + 'text'
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10236282
---
 modelscope/preprocessors/multi_modal.py |  3 ++-
 tests/pipelines/test_mplug_tasks.py     | 16 ++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 342ba6b5..f38ff8ae 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -159,7 +159,8 @@ class MPlugPreprocessor(Preprocessor):
         image = image.convert('RGB')
         image = self.patch_resize_transform(image)
         question = '' if self.cfg.task == Tasks.image_captioning \
-            else data[1 if isinstance(data, tuple) else 'question']
+            else data[1 if isinstance(data, tuple)
+                      else ('text' if 'text' in data else 'question')]
         question = self.tokenizer(
             question.lower(),
             padding='max_length',
diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py
index 273d3105..a3ace62d 100644
--- a/tests/pipelines/test_mplug_tasks.py
+++ b/tests/pipelines/test_mplug_tasks.py
@@ -44,8 +44,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
             'damo/mplug_visual-question-answering_coco_large_en')
         pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model)
         image = Image.open('data/test/images/image_mplug_vqa.jpg')
-        question = 'What is the woman doing?'
-        input = {'image': image, 'question': question}
+        text = 'What is the woman doing?'
+        input = {'image': image, 'text': text}
         result = pipeline_vqa(input)
         print(result)
 
@@ -54,8 +54,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         model = 'damo/mplug_visual-question-answering_coco_large_en'
         pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model)
         image = Image.open('data/test/images/image_mplug_vqa.jpg')
-        question = 'What is the woman doing?'
-        input = {'image': image, 'question': question}
+        text = 'What is the woman doing?'
+        input = {'image': image, 'text': text}
         result = pipeline_vqa(input)
         print(result)
 
@@ -65,8 +65,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
             'damo/mplug_image-text-retrieval_flickr30k_large_en')
         pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
         image = Image.open('data/test/images/image-text-retrieval.jpg')
-        question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
-        input = {'image': image, 'question': question}
+        text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
+        input = {'image': image, 'text': text}
         result = pipeline_retrieval(input)
         print(result)
 
@@ -75,8 +75,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         model = 'damo/mplug_image-text-retrieval_flickr30k_large_en'
         pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
         image = Image.open('data/test/images/image-text-retrieval.jpg')
-        question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
-        input = {'image': image, 'question': question}
+        text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
+        input = {'image': image, 'text': text}
         result = pipeline_retrieval(input)
         print(result)
 

From 3ed0c9c8d8da5a3bf55f65b5fbcb88fc05f067d5 Mon Sep 17 00:00:00 2001
From: "pengyu.lpy" <pengyu.lpy@alibaba-inc.com>
Date: Sat, 24 Sep 2022 17:58:42 +0800
Subject: [PATCH 137/175] [to #42322933] relax non-deterministic test validation
 constraints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

放松了tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py里面对于识别非deterministic的校验条件，以方便后续模型更新
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10245940
---
 .../test_segmentation_pipeline.py             | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
index db9c403a..80ab36a6 100644
--- a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
+++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
@@ -15,7 +15,7 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase):
 
     img_path = 'data/test/images/image_segmentation.jpg'
 
-    def _internal_test__(self, model_id):
+    def _internal_test_(self, model_id):
         img = np.asarray(Image.open(self.img_path))
 
         semantic_seg = pipeline(task=Tasks.image_segmentation, model=model_id)
@@ -26,12 +26,8 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase):
         results = outputs[0]
         self.assertListEqual(
             list(img.shape)[:2], list(results['seg_pred'].shape))
-        self.assertListEqual(results['seg_pred'][1, 4:10].tolist(),
-                             [161 for i in range(6)])
-        self.assertListEqual(results['seg_pred'][-1, -10:].tolist(),
-                             [133 for i in range(10)])
 
-    def _internal_test_batch(self, model_id, num_samples=2, batch_size=2):
+    def _internal_test_batch_(self, model_id, num_samples=2, batch_size=2):
         # TODO: support in the future
         img = np.asarray(Image.open(self.img_path))
         num_samples = num_samples
@@ -48,40 +44,42 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase):
         for output in outputs:
             self.assertListEqual(
                 list(img.shape)[:2], list(output['seg_pred'].shape))
-            self.assertListEqual(output['seg_pred'][1, 4:10].tolist(),
-                                 [161 for i in range(6)])
-            self.assertListEqual(output['seg_pred'][-1, -10:].tolist(),
-                                 [133 for i in range(10)])
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b0(self):
         model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k'
-        self._internal_test__(model_id)
+        self._internal_test_(model_id)
+        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b1(self):
         model_id = 'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k'
-        self._internal_test__(model_id)
+        self._internal_test_(model_id)
+        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b2(self):
         model_id = 'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k'
-        self._internal_test__(model_id)
+        self._internal_test_(model_id)
+        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b3(self):
         model_id = 'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k'
-        self._internal_test__(model_id)
+        self._internal_test_(model_id)
+        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b4(self):
         model_id = 'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k'
-        self._internal_test__(model_id)
+        self._internal_test_(model_id)
+        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b5(self):
         model_id = 'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k'
-        self._internal_test__(model_id)
+        self._internal_test_(model_id)
+        self._internal_test_batch_(model_id)
 
 
 if __name__ == '__main__':

From 047904ef73d42eccb33328089642ae6ffe20318d Mon Sep 17 00:00:00 2001
From: myf272609 <myf272609@alibaba-inc.com>
Date: Mon, 26 Sep 2022 11:55:06 +0800
Subject: [PATCH 138/175] [to #42322933] fix init issues for multi-style
 cartoon models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 修复多风格模型pipeline初始化问题
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10249429
---
 modelscope/pipelines/cv/image_cartoon_pipeline.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py
index f34be618..72fda989 100644
--- a/modelscope/pipelines/cv/image_cartoon_pipeline.py
+++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py
@@ -39,10 +39,13 @@ class ImageCartoonPipeline(Pipeline):
         super().__init__(model=model, **kwargs)
         with device_placement(self.framework, self.device_name):
             self.facer = FaceAna(self.model)
-            self.sess_anime_head = self.load_sess(
-                os.path.join(self.model, 'cartoon_h.pb'), 'model_anime_head')
-            self.sess_anime_bg = self.load_sess(
-                os.path.join(self.model, 'cartoon_bg.pb'), 'model_anime_bg')
+            with tf.Graph().as_default():
+                self.sess_anime_head = self.load_sess(
+                    os.path.join(self.model, 'cartoon_h.pb'),
+                    'model_anime_head')
+                self.sess_anime_bg = self.load_sess(
+                    os.path.join(self.model, 'cartoon_bg.pb'),
+                    'model_anime_bg')
 
         self.box_width = 288
         global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg'))

From 5e4894870bf56585f294f24bc485d97ab1420e4e Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Mon, 26 Sep 2022 12:23:28 +0800
Subject: [PATCH 139/175] [to #42322933]add t5 model / text2text generation
 task         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10191736

    * add T5 for generation
---
 modelscope/metainfo.py                        |    3 +
 modelscope/models/nlp/T5/__init__.py          |   21 +
 modelscope/models/nlp/T5/configuration_t5.py  |  174 ++
 modelscope/models/nlp/T5/modeling_t5.py       | 2003 +++++++++++++++++
 .../models/nlp/T5/t5_for_text_generation.py   |   56 +
 modelscope/models/nlp/__init__.py             |    3 +-
 modelscope/outputs.py                         |    7 +
 modelscope/pipelines/nlp/__init__.py          |    4 +-
 .../nlp/text2text_generation_pipeline.py      |   87 +
 modelscope/preprocessors/__init__.py          |    3 +-
 modelscope/preprocessors/nlp/__init__.py      |    2 +
 modelscope/preprocessors/nlp/nlp_base.py      |   35 +
 modelscope/utils/constant.py                  |    1 +
 tests/pipelines/test_text2text_generation.py  |   61 +
 14 files changed, 2457 insertions(+), 3 deletions(-)
 create mode 100644 modelscope/models/nlp/T5/__init__.py
 create mode 100644 modelscope/models/nlp/T5/configuration_t5.py
 create mode 100644 modelscope/models/nlp/T5/modeling_t5.py
 create mode 100644 modelscope/models/nlp/T5/t5_for_text_generation.py
 create mode 100644 modelscope/pipelines/nlp/text2text_generation_pipeline.py
 create mode 100644 tests/pipelines/test_text2text_generation.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 80a522b2..29a35fbe 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -65,6 +65,7 @@ class Models(object):
     plug = 'plug'
     bert_for_ds = 'bert-for-document-segmentation'
     ponet = 'ponet'
+    T5 = 'T5'
 
     # audio models
     sambert_hifigan = 'sambert-hifigan'
@@ -179,6 +180,7 @@ class Pipelines(object):
     part_of_speech = 'part-of-speech'
     named_entity_recognition = 'named-entity-recognition'
     text_generation = 'text-generation'
+    text2text_generation = 'text2text-generation'
     sentiment_analysis = 'sentiment-analysis'
     sentiment_classification = 'sentiment-classification'
     text_classification = 'text-classification'
@@ -280,6 +282,7 @@ class Preprocessors(object):
     cross_encoder_tokenizer = 'cross-encoder-tokenizer'
     bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
     text_gen_tokenizer = 'text-gen-tokenizer'
+    text2text_gen_preprocessor = 'text2text-gen-preprocessor'
     token_cls_tokenizer = 'token-cls-tokenizer'
     ner_tokenizer = 'ner-tokenizer'
     nli_tokenizer = 'nli-tokenizer'
diff --git a/modelscope/models/nlp/T5/__init__.py b/modelscope/models/nlp/T5/__init__.py
new file mode 100644
index 00000000..7c1cea36
--- /dev/null
+++ b/modelscope/models/nlp/T5/__init__.py
@@ -0,0 +1,21 @@
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .t5_for_text_generation import T5ForConditionalGeneration
+
+else:
+    _import_structure = {
+        't5_for_text_generation': ['T5ForConditionalGeneration'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/T5/configuration_t5.py b/modelscope/models/nlp/T5/configuration_t5.py
new file mode 100644
index 00000000..117a6bc1
--- /dev/null
+++ b/modelscope/models/nlp/T5/configuration_t5.py
@@ -0,0 +1,174 @@
+# Copyright 2020, The T5 Authors and HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" T5 model configuration"""
+from typing import Mapping
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxSeq2SeqConfigWithPast
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class T5Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`T5Model`] or a [`TFT5Model`]. It is used to
+    instantiate a T5 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the T5
+    [t5-small](https://huggingface.co/t5-small) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Arguments:
+        vocab_size (`int`, *optional*, defaults to 32128):
+            Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`].
+        d_model (`int`, *optional*, defaults to 512):
+            Size of the encoder layers and the pooler layer.
+        d_kv (`int`, *optional*, defaults to 64):
+            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
+            num_heads`.
+        d_ff (`int`, *optional*, defaults to 2048):
+            Size of the intermediate feed forward layer in each `T5Block`.
+        num_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        num_decoder_layers (`int`, *optional*):
+            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
+        num_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
+            The number of buckets to use for each attention layer.
+        relative_attention_max_distance (`int`, *optional*, defaults to 128):
+            The maximum distance of the longer sequences for the bucket separation.
+        dropout_rate (`float`, *optional*, defaults to 0.1):
+            The ratio for all dropout layers.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the layer normalization layers.
+        initializer_factor (`float`, *optional*, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
+            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. T5v1.1 uses the
+            `"gated-gelu"` feed forward projection. Original T5 uses `"relu"`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+    """
+    model_type = 't5'
+    keys_to_ignore_at_inference = ['past_key_values']
+    attribute_map = {
+        'hidden_size': 'd_model',
+        'num_attention_heads': 'num_heads',
+        'num_hidden_layers': 'num_layers'
+    }
+
+    def __init__(self,
+                 vocab_size=32128,
+                 d_model=512,
+                 d_kv=64,
+                 d_ff=2048,
+                 num_layers=6,
+                 num_decoder_layers=None,
+                 num_heads=8,
+                 relative_attention_num_buckets=32,
+                 relative_attention_max_distance=128,
+                 dropout_rate=0.1,
+                 layer_norm_epsilon=1e-6,
+                 initializer_factor=1.0,
+                 feed_forward_proj='relu',
+                 is_encoder_decoder=True,
+                 use_cache=True,
+                 pad_token_id=0,
+                 eos_token_id=1,
+                 **kwargs):
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.d_kv = d_kv
+        self.d_ff = d_ff
+        self.num_layers = num_layers
+        self.num_decoder_layers = (num_decoder_layers if num_decoder_layers
+                                   is not None else self.num_layers
+                                   )  # default = symmetry
+        self.num_heads = num_heads
+        self.relative_attention_num_buckets = relative_attention_num_buckets
+        self.relative_attention_max_distance = relative_attention_max_distance
+        self.dropout_rate = dropout_rate
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_factor = initializer_factor
+        self.feed_forward_proj = feed_forward_proj
+        self.use_cache = use_cache
+
+        act_info = self.feed_forward_proj.split('-')
+        self.dense_act_fn = act_info[-1]
+        self.is_gated_act = act_info[0] == 'gated'
+
+        if len(act_info) > 1 and act_info[0] != 'gated' or len(act_info) > 2:
+            raise ValueError(
+                f'`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer.'
+                'Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. '
+                "'gated-gelu' or 'relu'")
+
+        # for backwards compatibility
+        if feed_forward_proj == 'gated-gelu':
+            self.dense_act_fn = 'gelu_new'
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+
+
+class T5OnnxConfig(OnnxSeq2SeqConfigWithPast):
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = {
+            'input_ids': {
+                0: 'batch',
+                1: 'encoder_sequence'
+            },
+            'attention_mask': {
+                0: 'batch',
+                1: 'encoder_sequence'
+            },
+        }
+        if self.use_past:
+            common_inputs['attention_mask'][
+                1] = 'past_encoder_sequence + sequence'
+            common_inputs['decoder_input_ids'] = {0: 'batch'}
+            common_inputs['decoder_attention_mask'] = {
+                0: 'batch',
+                1: 'past_decoder_sequence + sequence'
+            }
+        else:
+            common_inputs['decoder_input_ids'] = {
+                0: 'batch',
+                1: 'decoder_sequence'
+            }
+            common_inputs['decoder_attention_mask'] = {
+                0: 'batch',
+                1: 'decoder_sequence'
+            }
+
+        if self.use_past:
+            self.fill_with_past_key_values_(common_inputs, direction='inputs')
+
+        return common_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 13
diff --git a/modelscope/models/nlp/T5/modeling_t5.py b/modelscope/models/nlp/T5/modeling_t5.py
new file mode 100644
index 00000000..da50741e
--- /dev/null
+++ b/modelscope/models/nlp/T5/modeling_t5.py
@@ -0,0 +1,2003 @@
+# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch T5 model."""
+
+import copy
+import math
+import os
+import warnings
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from torch.utils.checkpoint import checkpoint
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+    BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput, Seq2SeqModelOutput)
+from transformers.modeling_utils import (PreTrainedModel,
+                                         find_pruneable_heads_and_indices,
+                                         prune_linear_layer)
+from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings,
+                                add_start_docstrings_to_model_forward,
+                                is_torch_fx_proxy, replace_return_docstrings)
+from transformers.utils.model_parallel_utils import (assert_device_map,
+                                                     get_device_map)
+
+from modelscope.utils.logger import get_logger
+from .configuration_t5 import T5Config
+
+logger = get_logger(__name__)
+
+_CONFIG_FOR_DOC = 'T5Config'
+_TOKENIZER_FOR_DOC = 'T5Tokenizer'
+_CHECKPOINT_FOR_DOC = 't5-small'
+
+####################################################
+# This dict contains ids and associated url
+# for the pretrained weights provided with the models
+####################################################
+T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    't5-small',
+    't5-base',
+    't5-large',
+    't5-3b',
+    't5-11b',
+    # See all T5 models at https://huggingface.co/models?filter=t5
+]
+
+
+####################################################
+# This is a conversion method from TF 1.0 to PyTorch
+# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
+####################################################
+def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see '
+            'https://www.tensorflow.org/install/ for installation instructions.'
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f'Converting TensorFlow checkpoint from {tf_path}')
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    tf_weights = {}
+    for name, shape in init_vars:
+        logger.info(f'Loading TF weight {name} with shape {shape}')
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        tf_weights[name] = array
+
+    for txt_name in names:
+        name = txt_name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(n in [
+                'adam_v', 'adam_m', 'AdamWeightDecayOptimizer',
+                'AdamWeightDecayOptimizer_1', 'global_step'
+        ] for n in name):
+            logger.info(f"Skipping {'/'.join(name)}")
+            tf_weights.pop(txt_name, None)
+            continue
+        if '_slot_' in name[-1]:
+            logger.info(f"Skipping {'/'.join(name)}")
+            tf_weights.pop(txt_name, None)
+            continue
+        pointer = model
+        array = tf_weights[txt_name]
+
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                scope_names = re.split(r'_(\d+)', m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] in ['kernel', 'scale', 'embedding']:
+                pointer = getattr(pointer, 'weight')
+            elif scope_names[0] == 'self_attention':
+                pointer = getattr(pointer, 'layer')
+                pointer = pointer[0]
+            elif scope_names[0] == 'enc_dec_attention':
+                pointer = getattr(pointer, 'layer')
+                pointer = pointer[1]
+            elif scope_names[0] == 'dense_relu_dense':
+                pointer = getattr(pointer, 'layer')
+                pointer = pointer[2]
+            elif scope_names[0] == 'rms_norm':
+                if hasattr(pointer, 'layer_norm'):
+                    pointer = getattr(pointer, 'layer_norm')
+                elif hasattr(pointer, 'final_layer_norm'):
+                    pointer = getattr(pointer, 'final_layer_norm')
+            elif scope_names[0] == 'scale':
+                pointer = getattr(pointer, 'weight')
+            elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif scope_names[0] == 'squad':
+                pointer = getattr(pointer, 'classifier')
+            elif scope_names[0] == 'decoder' and name[1] == 'logits':
+                continue
+            elif scope_names[0] == 'logits':
+                pointer = getattr(pointer, 'lm_head')
+            elif scope_names[0] == 'wi' and len(
+                    scope_names) > 1 and scope_names[1].isdigit():
+                pointer = getattr(pointer, f'wi_{scope_names[1]}')
+                continue
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        if scope_names[0] not in ['kernel', 'scale', 'embedding']:
+            pointer = getattr(pointer, 'weight')
+        if scope_names[0] != 'embedding':
+            logger.info(
+                f'Transposing numpy weight of shape {array.shape} for {name}')
+            array = np.transpose(array)
+        try:
+            assert (
+                pointer.shape == array.shape
+            ), f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched'
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f'Initialize PyTorch weight {name}')
+        pointer.data = torch.from_numpy(array.astype(np.float32))
+        tf_weights.pop(txt_name, None)
+
+    logger.info(
+        f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}."
+    )
+    return model
+
+
+####################################################
+# PyTorch Models are constructed by sub-classing
+# - torch.nn.Module for the layers and
+# - PreTrainedModel for the models (it-self a sub-class of nn.Module)
+####################################################
+PARALLELIZE_DOCSTRING = r"""
+    This is an experimental feature and is a subject to change at a moment's notice.
+
+    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
+    it will evenly distribute blocks across all devices.
+
+    Args:
+        device_map (`Dict[int, list]`, optional, defaults to None):
+            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
+            automatically mapped to the first device (for esoteric reasons). That means that the first device should
+            have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
+            following number of attention modules:
+
+                - t5-small: 6
+                - t5-base: 12
+                - t5-large: 24
+                - t5-3b: 24
+                - t5-11b: 24
+
+    Example:
+
+    ```python
+    # Here is an example of a device map on a machine with 4 GPUs
+    # using t5-3b, which has a total of 24 attention modules:
+    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+    device_map = {
+        0: [0, 1, 2],
+        1: [3, 4, 5, 6, 7, 8, 9],
+        2: [10, 11, 12, 13, 14, 15, 16],
+        3: [17, 18, 19, 20, 21, 22, 23],
+    }
+    model.parallelize(device_map)
+    ```
+"""
+DEPARALLELIZE_DOCSTRING = r"""
+    Moves the model to cpu from a model parallel state.
+
+    Example:
+
+    ```python
+    # On a 4 GPU machine with t5-3b:
+    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+    device_map = {
+        0: [0, 1, 2],
+        1: [3, 4, 5, 6, 7, 8, 9],
+        2: [10, 11, 12, 13, 14, 15, 16],
+        3: [17, 18, 19, 20, 21, 22, 23],
+    }
+    model.parallelize(device_map)  # Splits the model across several devices
+    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    ```
+"""
+
+
+class T5LayerNorm(nn.Module):
+    """Scale-only layer norm: divides by the root mean square, no mean or bias."""
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        """Normalize over the last dimension and multiply by the learned `weight`."""
+        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
+        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
+        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
+        # half-precision inputs is done in fp32
+
+        variance = hidden_states.to(torch.float32).pow(2).mean(
+            -1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance
+                                                    + self.variance_epsilon)
+
+        # convert into half-precision if necessary
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+
+        return self.weight * hidden_states
+
+
+try:
+    from apex.normalization import FusedRMSNorm
+    # Rebind the module-level name so later references to T5LayerNorm get the apex class.
+    T5LayerNorm = FusedRMSNorm  # noqa
+
+    logger.info(
+        'Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm'
+    )
+except ImportError:
+    # apex is not importable: keep using the pure-PyTorch T5LayerNorm
+    pass
+except Exception:
+    logger.warning(
+        'discovered apex but it failed to load, falling back to T5LayerNorm')
+    pass
+
+
+class T5DenseReluDense(nn.Module):
+    """Bias-free feed-forward block: wi -> ReLU -> dropout -> wo."""
+    def __init__(self, config: T5Config):
+        super().__init__()
+        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
+        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        hidden_states = self.wi(hidden_states)
+        hidden_states = nn.functional.relu(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.wo(hidden_states)
+        return hidden_states
+
+
+class T5DenseGatedGeluDense(nn.Module):
+    """Gated feed-forward block: gelu_new(wi_0(x)) * wi_1(x) -> dropout -> wo."""
+    def __init__(self, config: T5Config):
+        super().__init__()
+        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
+        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
+        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.gelu_act = ACT2FN['gelu_new']
+
+    def forward(self, hidden_states):
+        hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
+        hidden_linear = self.wi_1(hidden_states)
+        hidden_states = hidden_gelu * hidden_linear
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.wo(hidden_states)
+        return hidden_states
+
+
+class T5LayerFF(nn.Module):
+    """Pre-norm feed-forward sub-layer: layer norm, dense block, dropout, residual."""
+    def __init__(self, config: T5Config):
+        super().__init__()
+        if config.feed_forward_proj == 'relu':
+            self.DenseReluDense = T5DenseReluDense(config)
+        elif config.feed_forward_proj == 'gated-gelu':
+            self.DenseReluDense = T5DenseGatedGeluDense(config)
+        else:
+            # Read `config` directly: nn.Module defines no `self.config` attribute.
+            raise ValueError(
+                f'{config.feed_forward_proj} is not supported. Choose between `relu` and `gated-gelu`')
+
+        self.layer_norm = T5LayerNorm(
+            config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        forwarded_states = self.layer_norm(hidden_states)
+        forwarded_states = self.DenseReluDense(forwarded_states)
+        hidden_states = hidden_states + self.dropout(forwarded_states)
+        return hidden_states
+
+
+class T5Attention(nn.Module):
+    """Multi-head attention with an optional learned relative position bias."""
+    def __init__(self, config: T5Config, has_relative_attention_bias=False):
+        super().__init__()
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.relative_attention_max_distance = config.relative_attention_max_distance
+        self.d_model = config.d_model
+        self.key_value_proj_dim = config.d_kv
+        self.n_heads = config.num_heads
+        self.dropout = config.dropout_rate
+        self.inner_dim = self.n_heads * self.key_value_proj_dim
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
+
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = nn.Embedding(
+                self.relative_attention_num_buckets, self.n_heads)
+        self.pruned_heads = set()
+        self.gradient_checkpointing = False
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads)
+        # Prune linear layers
+        self.q = prune_linear_layer(self.q, index)
+        self.k = prune_linear_layer(self.k, index)
+        self.v = prune_linear_layer(self.v, index)
+        self.o = prune_linear_layer(self.o, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.inner_dim = self.key_value_proj_dim * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @staticmethod
+    def _relative_position_bucket(relative_position,
+                                  bidirectional=True,
+                                  num_buckets=32,
+                                  max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention. The relative position is defined as
+        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
+        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
+        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
+        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
+        This should allow for more graceful generalization to longer sequences than the model has been trained on
+
+        Args:
+            relative_position: an int32 Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+
+        Returns:
+            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
+        """
+        relative_buckets = 0
+        if bidirectional:
+            num_buckets //= 2
+            relative_buckets += (relative_position > 0).to(
+                torch.long) * num_buckets
+            relative_position = torch.abs(relative_position)
+        else:
+            relative_position = -torch.min(relative_position,
+                                           torch.zeros_like(relative_position))
+        # now relative_position is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = num_buckets // 2
+        is_small = relative_position < max_exact
+
+        # The other half of the buckets are for logarithmically bigger bins in
+        # positions up to max_distance
+        relateive_pos_log = torch.log(relative_position.float() / max_exact)
+        max_dis_log = math.log(max_distance / max_exact)
+        origin_relative_position = relateive_pos_log / max_dis_log * (
+            num_buckets - max_exact)
+        relative_postion_if_large = max_exact + origin_relative_position.to(
+            torch.long)
+        relative_postion_if_large = torch.min(
+            relative_postion_if_large,
+            torch.full_like(relative_postion_if_large, num_buckets - 1))
+
+        relative_buckets += torch.where(is_small, relative_position,
+                                        relative_postion_if_large)
+        return relative_buckets
+
+    def compute_bias(self, query_length, key_length):
+        """Compute binned relative position bias"""
+        context_position = torch.arange(
+            query_length,
+            dtype=torch.long,
+            device=self.relative_attention_bias.weight.device)[:, None]
+        memory_position = torch.arange(
+            key_length,
+            dtype=torch.long,
+            device=self.relative_attention_bias.weight.device)[None, :]
+        relative_position = memory_position - context_position  # shape (query_length, key_length)
+        relative_position_bucket = self._relative_position_bucket(
+            relative_position,  # shape (query_length, key_length)
+            bidirectional=(not self.is_decoder),
+            num_buckets=self.relative_attention_num_buckets,
+            max_distance=self.relative_attention_max_distance,
+        )
+        values = self.relative_attention_bias(
+            relative_position_bucket
+        )  # shape (query_length, key_length, num_heads)
+        values = values.permute([2, 0, 1]).unsqueeze(
+            0)  # shape (1, num_heads, query_length, key_length)
+        return values
+
+    def forward(
+        self,
+        hidden_states,
+        mask=None,
+        key_value_states=None,
+        position_bias=None,
+        past_key_value=None,
+        layer_head_mask=None,
+        query_length=None,
+        use_cache=False,
+        output_attentions=False,
+    ):
+        """
+        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
+        """
+        # Input is (batch_size, seq_length, dim)
+        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
+        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        real_seq_length = seq_length
+
+        if past_key_value is not None:
+            assert (
+                len(past_key_value) == 2
+            ), f'past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states'
+            real_seq_length += past_key_value[0].shape[
+                2] if query_length is None else query_length
+
+        key_length = real_seq_length if key_value_states is None else key_value_states.shape[
+            1]
+
+        def shape(states):
+            """split the last dim into (n_heads, dim_per_head) and move heads to dim 1"""
+            return states.view(batch_size, -1, self.n_heads,
+                               self.key_value_proj_dim).transpose(1, 2)
+
+        def unshape(states):
+            """inverse of `shape`: merge the heads back into one inner_dim axis"""
+            return states.transpose(1, 2).contiguous().view(
+                batch_size, -1, self.inner_dim)
+
+        def project(hidden_states, proj_layer, key_value_states,
+                    past_key_value):
+            """projects hidden states correctly to key/query states"""
+            if key_value_states is None:
+                # self-attn
+                # (batch_size, n_heads, seq_length, dim_per_head)
+                hidden_states = shape(proj_layer(hidden_states))
+            elif past_key_value is None:
+                # cross-attn
+                # (batch_size, n_heads, seq_length, dim_per_head)
+                hidden_states = shape(proj_layer(key_value_states))
+
+            if past_key_value is not None:
+                if key_value_states is None:
+                    # self-attn
+                    # (batch_size, n_heads, key_length, dim_per_head)
+                    hidden_states = torch.cat([past_key_value, hidden_states],
+                                              dim=2)
+                else:
+                    # cross-attn
+                    hidden_states = past_key_value
+            return hidden_states
+
+        # get query states
+        query_states = shape(self.q(
+            hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
+
+        # get key/value states
+        key_states = project(
+            hidden_states, self.k, key_value_states,
+            past_key_value[0] if past_key_value is not None else None)
+        value_states = project(
+            hidden_states, self.v, key_value_states,
+            past_key_value[1] if past_key_value is not None else None)
+
+        # compute scores
+        scores = torch.matmul(
+            query_states, key_states.transpose(3, 2)
+        )  # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9
+
+        if position_bias is None:
+            if not self.has_relative_attention_bias:
+                position_bias = torch.zeros(
+                    (1, self.n_heads, real_seq_length, key_length),
+                    device=scores.device,
+                    dtype=scores.dtype)
+                if self.gradient_checkpointing and self.training:
+                    position_bias.requires_grad = True
+            else:
+                position_bias = self.compute_bias(real_seq_length, key_length)
+
+            # if key and values are already calculated
+            # we want only the last query position bias
+            if past_key_value is not None:
+                position_bias = position_bias[:, :, -hidden_states.size(1):, :]
+
+            if mask is not None:
+                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
+
+        scores += position_bias
+        attn_weights = nn.functional.softmax(
+            scores.float(), dim=-1).type_as(
+                scores)  # (batch_size, n_heads, seq_length, key_length)
+        attn_weights = nn.functional.dropout(
+            attn_weights, p=self.dropout, training=self.training
+        )  # (batch_size, n_heads, seq_length, key_length)
+
+        # Mask heads if we want to
+        if layer_head_mask is not None:
+            attn_weights = attn_weights * layer_head_mask
+
+        attn_output = unshape(torch.matmul(
+            attn_weights, value_states))  # (batch_size, seq_length, dim)
+        attn_output = self.o(attn_output)
+
+        present_key_value_state = (key_states,
+                                   value_states) if (self.is_decoder
+                                                     and use_cache) else None
+        outputs = (attn_output, ) + (present_key_value_state, ) + (
+            position_bias, )
+
+        if output_attentions:
+            outputs = outputs + (attn_weights, )
+        return outputs
+
+
+class T5LayerSelfAttention(nn.Module):
+    """Pre-norm self-attention sub-layer with dropout and a residual connection."""
+    def __init__(self, config, has_relative_attention_bias=False):
+        super().__init__()
+        self.SelfAttention = T5Attention(
+            config, has_relative_attention_bias=has_relative_attention_bias)
+        self.layer_norm = T5LayerNorm(
+            config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        past_key_value=None,
+        use_cache=False,
+        output_attentions=False,
+    ):
+        normed_hidden_states = self.layer_norm(hidden_states)
+        attention_output = self.SelfAttention(
+            normed_hidden_states,
+            mask=attention_mask,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + self.dropout(attention_output[0])
+        outputs = (hidden_states,
+                   ) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class T5LayerCrossAttention(nn.Module):
+    """Pre-norm attention over `key_value_states` with dropout and a residual add."""
+    def __init__(self, config):
+        super().__init__()
+        self.EncDecAttention = T5Attention(
+            config, has_relative_attention_bias=False)
+        self.layer_norm = T5LayerNorm(
+            config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(
+        self,
+        hidden_states,
+        key_value_states,
+        attention_mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        past_key_value=None,
+        use_cache=False,
+        query_length=None,
+        output_attentions=False,
+    ):
+        normed_hidden_states = self.layer_norm(hidden_states)
+        attention_output = self.EncDecAttention(
+            normed_hidden_states,
+            mask=attention_mask,
+            key_value_states=key_value_states,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+            query_length=query_length,
+            output_attentions=output_attentions,
+        )
+        layer_output = hidden_states + self.dropout(attention_output[0])
+        outputs = (layer_output,
+                   ) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class T5Block(nn.Module):
+    """Self-attention, cross-attention (decoder only), then feed-forward, in order."""
+    def __init__(self, config, has_relative_attention_bias=False):
+        super().__init__()
+        self.is_decoder = config.is_decoder
+        self.layer = nn.ModuleList()
+        self.layer.append(
+            T5LayerSelfAttention(
+                config,
+                has_relative_attention_bias=has_relative_attention_bias))
+        if self.is_decoder:
+            self.layer.append(T5LayerCrossAttention(config))
+
+        self.layer.append(T5LayerFF(config))
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        encoder_decoder_position_bias=None,
+        layer_head_mask=None,
+        cross_attn_layer_head_mask=None,
+        past_key_value=None,
+        use_cache=False,
+        output_attentions=False,
+        return_dict=True,
+    ):
+
+        if past_key_value is not None:
+            if not self.is_decoder:
+                logger.warning(
+                    '`past_key_values` is passed to the encoder. Please make sure this is intended.'
+                )
+            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4
+
+            if len(past_key_value) != expected_num_past_key_values:
+                raise ValueError(
+                    f'There should be {expected_num_past_key_values} past states. '
+                    f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
+                    f'Got {len(past_key_value)} past key / value states')
+
+            self_attn_past_key_value = past_key_value[:2]
+            cross_attn_past_key_value = past_key_value[2:]
+        else:
+            self_attn_past_key_value, cross_attn_past_key_value = None, None
+
+        self_attention_outputs = self.layer[0](
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            past_key_value=self_attn_past_key_value,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        hidden_states, present_key_value_state = self_attention_outputs[:2]
+        attention_outputs = self_attention_outputs[
+            2:]  # Keep self-attention outputs and relative position weights
+
+        # clamp inf values to enable fp16 training
+        if hidden_states.dtype == torch.float16 and torch.isinf(
+                hidden_states).any():
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value)
+
+        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
+        if do_cross_attention:
+            # the actual query length is unknown for cross attention
+            # if using past key value states. Need to inject it here
+            if present_key_value_state is not None:
+                query_length = present_key_value_state[0].shape[2]
+            else:
+                query_length = None
+
+            cross_attention_outputs = self.layer[1](
+                hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                position_bias=encoder_decoder_position_bias,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+                query_length=query_length,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = cross_attention_outputs[0]
+
+            # clamp inf values to enable fp16 training
+            if hidden_states.dtype == torch.float16 and torch.isinf(
+                    hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(
+                    hidden_states, min=-clamp_value, max=clamp_value)
+
+            # Combine self attn and cross attn key value states
+            if present_key_value_state is not None:
+                present_key_value_state = present_key_value_state + cross_attention_outputs[
+                    1]
+
+            # Keep cross-attention outputs and relative position weights
+            attention_outputs = attention_outputs + cross_attention_outputs[2:]
+
+        # Apply Feed Forward layer
+        hidden_states = self.layer[-1](hidden_states)
+
+        # clamp inf values to enable fp16 training
+        if hidden_states.dtype == torch.float16 and torch.isinf(
+                hidden_states).any():
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states, )
+
+        if use_cache:
+            outputs = outputs + (present_key_value_state, ) + attention_outputs
+        else:
+            outputs = outputs + attention_outputs
+
+        # hidden-states, present_key_value_states, (self-attention position
+        # bias), (self-attention weights), (cross-attention position bias),
+        # (cross-attention weights)
+        return outputs
+
+
+class T5PreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface
+    for downloading and loading pretrained models.
+    """
+
+    config_class = T5Config
+    load_tf_weights = load_tf_weights_in_t5
+    base_model_prefix = 'transformer'
+    is_parallelizable = True
+    supports_gradient_checkpointing = True
+
+    @property
+    def dummy_inputs(self):
+        input_ids = torch.tensor(DUMMY_INPUTS)
+        input_mask = torch.tensor(DUMMY_MASK)
+        dummy_inputs = {
+            'decoder_input_ids': input_ids,
+            'input_ids': input_ids,
+            'decoder_attention_mask': input_mask,
+        }
+        return dummy_inputs
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        factor = self.config.initializer_factor  # Used for testing weights initialization
+        if isinstance(module, T5LayerNorm):
+            module.weight.data.fill_(factor * 1.0)
+        elif isinstance(module,
+                        (T5Model, T5ForConditionalGeneration, T5EncoderModel)):
+            # Mesh TensorFlow embeddings initialization See
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
+        elif isinstance(module, T5DenseReluDense):
+            # Mesh TensorFlow FF initialization See
+            # https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
+            # and
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
+            module.wi.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_model)**-0.5))
+            if hasattr(module.wi, 'bias') and module.wi.bias is not None:
+                module.wi.bias.data.zero_()
+            module.wo.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
+            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
+                module.wo.bias.data.zero_()
+        elif isinstance(module, T5DenseGatedGeluDense):
+            module.wi_0.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_model)**-0.5))
+            if hasattr(module.wi_0, 'bias') and module.wi_0.bias is not None:
+                module.wi_0.bias.data.zero_()
+            module.wi_1.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_model)**-0.5))
+            if hasattr(module.wi_1, 'bias') and module.wi_1.bias is not None:
+                module.wi_1.bias.data.zero_()
+            module.wo.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
+            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
+                module.wo.bias.data.zero_()
+        elif isinstance(module, T5Attention):
+            # Mesh TensorFlow attention initialization to avoid scaling before
+            # softmax See
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
+            d_model = self.config.d_model
+            key_value_proj_dim = self.config.d_kv
+            n_heads = self.config.num_heads
+            module.q.weight.data.normal_(
+                mean=0.0, std=factor * ((d_model * key_value_proj_dim)**-0.5))
+            module.k.weight.data.normal_(
+                mean=0.0, std=factor * (d_model**-0.5))
+            module.v.weight.data.normal_(
+                mean=0.0, std=factor * (d_model**-0.5))
+            module.o.weight.data.normal_(
+                mean=0.0, std=factor * ((n_heads * key_value_proj_dim)**-0.5))
+            if module.has_relative_attention_bias:
+                module.relative_attention_bias.weight.data.normal_(
+                    mean=0.0, std=factor * ((d_model)**-0.5))
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (T5Attention, T5Stack)):
+            module.gradient_checkpointing = value
+
+    def _shift_right(self, input_ids):
+        """Prepend the decoder start token, drop the last token, map -100 to pad."""
+        decoder_start_token_id = self.config.decoder_start_token_id
+        pad_token_id = self.config.pad_token_id
+        assert (
+            decoder_start_token_id is not None
+        ), 'self.model.config.decoder_start_token_id has to be defined.'
+
+        # shift inputs to the right
+        if is_torch_fx_proxy(input_ids):
+            # Item assignment is not supported natively for proxies.
+            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1, ),
+                                           decoder_start_token_id)
+            shifted_input_ids = torch.cat(
+                [shifted_input_ids, input_ids[..., :-1]], dim=-1)
+        else:
+            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+            shifted_input_ids[..., 0] = decoder_start_token_id
+
+        assert pad_token_id is not None, 'self.model.config.pad_token_id has to be defined.'
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+        assert torch.all(shifted_input_ids >= 0).item(
+        ), 'Verify that `shifted_input_ids` has only positive values'
+
+        return shifted_input_ids
+
+
+class T5Stack(T5PreTrainedModel):
+
+    def __init__(self, config, embed_tokens=None):
+        super().__init__(config)
+        # The embedding module is supplied by the caller (may be None), not built here.
+        self.embed_tokens = embed_tokens
+        self.is_decoder = config.is_decoder
+        # Only the first block is built with has_relative_attention_bias=True.
+        self.block = nn.ModuleList([
+            T5Block(config, has_relative_attention_bias=bool(i == 0))
+            for i in range(config.num_layers)
+        ])
+        self.final_layer_norm = T5LayerNorm(
+            config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        # Build a default map over all visible CUDA devices if none is given, then validate it
+        self.device_map = (
+            get_device_map(len(self.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.block))
+        self.model_parallel = True
+        self.first_device = 'cpu' if 'cpu' in self.device_map.keys(
+        ) else 'cuda:' + str(min(self.device_map.keys()))
+        self.last_device = 'cuda:' + str(max(self.device_map.keys()))
+        # Move each block to the CUDA device named by its map entry
+        for k, v in self.device_map.items():
+            for layer in v:
+                cuda_device = 'cuda:' + str(k)
+                self.block[layer] = self.block[layer].to(cuda_device)
+
+        # Put embed_tokens on the first device
+        self.embed_tokens = self.embed_tokens.to(self.first_device)
+        # Set final layer norm to last device
+        self.final_layer_norm = self.final_layer_norm.to(self.last_device)
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.model_parallel = False
+        self.device_map = None
+        self.first_device = 'cpu'
+        self.last_device = 'cpu'
+        for i in range(len(self.block)):
+            self.block[i] = self.block[i].to('cpu')
+        self.embed_tokens = self.embed_tokens.to('cpu')
+        self.final_layer_norm = self.final_layer_norm.to('cpu')
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens  # the embedding module given to __init__ or set afterwards
+
+    def set_input_embeddings(self, new_embeddings):
+        self.embed_tokens = new_embeddings  # forward() embeds input_ids with this module
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        inputs_embeds=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        # Model parallel
+        if self.model_parallel:
+            torch.cuda.set_device(self.first_device)
+            self.embed_tokens = self.embed_tokens.to(self.first_device)
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            err_msg_prefix = 'decoder_' if self.is_decoder else ''
+            raise ValueError(
+                f'You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            err_msg_prefix = 'decoder_' if self.is_decoder else ''
+            raise ValueError(
+                f'You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds'
+            )
+
+        if inputs_embeds is None:
+            assert self.embed_tokens is not None, 'You have to initialize the model with valid token embeddings'
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        batch_size, seq_length = input_shape
+
+        # required mask seq length can be calculated via length of past
+        mask_seq_length = past_key_values[0][0].shape[
+            2] + seq_length if past_key_values is not None else seq_length
+
+        if use_cache is True:
+            assert self.is_decoder, f'`use_cache` can only be set to `True` if {self} is used as a decoder'
+
+        if attention_mask is None:
+            attention_mask = torch.ones(batch_size, mask_seq_length).to(
+                inputs_embeds.device)
+        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
+            encoder_seq_length = encoder_hidden_states.shape[1]
+            encoder_attention_mask = torch.ones(
+                batch_size,
+                encoder_seq_length,
+                device=inputs_embeds.device,
+                dtype=torch.long)
+
+        # initialize past_key_values with `None` if past does not exist
+        if past_key_values is None:
+            past_key_values = [None] * len(self.block)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask = self.get_extended_attention_mask(
+            attention_mask, input_shape, inputs_embeds.device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size(
+            )
+            encoder_hidden_shape = (encoder_batch_size,
+                                    encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(
+                    encoder_hidden_shape, device=inputs_embeds.device)
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
+        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask,
+                                                  self.config.num_layers)
+        present_key_value_states = () if use_cache else None
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions
+                                      and self.is_decoder) else None
+        position_bias = None
+        encoder_decoder_position_bias = None
+
+        hidden_states = self.dropout(inputs_embeds)
+
+        for i, (layer_module,
+                past_key_value) in enumerate(zip(self.block, past_key_values)):
+            layer_head_mask = head_mask[i]
+            cross_attn_layer_head_mask = cross_attn_head_mask[i]
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if position_bias is not None:
+                    position_bias = position_bias.to(hidden_states.device)
+                if encoder_hidden_states is not None:
+                    encoder_hidden_states = encoder_hidden_states.to(
+                        hidden_states.device)
+                if encoder_extended_attention_mask is not None:
+                    encoder_extended_attention_mask = encoder_extended_attention_mask.to(
+                        hidden_states.device)
+                if encoder_decoder_position_bias is not None:
+                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(
+                        hidden_states.device)
+                if layer_head_mask is not None:
+                    layer_head_mask = layer_head_mask.to(hidden_states.device)
+                if cross_attn_layer_head_mask is not None:
+                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(
+                        hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states, )
+
+            if self.gradient_checkpointing and self.training:
+                if use_cache:
+                    logger.warning(
+                        '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+
+                    def custom_forward(*inputs):
+                        return tuple(
+                            module(*inputs, use_cache, output_attentions))
+
+                    return custom_forward
+
+                layer_outputs = checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    extended_attention_mask,
+                    position_bias,
+                    encoder_hidden_states,
+                    encoder_extended_attention_mask,
+                    encoder_decoder_position_bias,
+                    layer_head_mask,
+                    cross_attn_layer_head_mask,
+                    None,  # past_key_value is always None with gradient checkpointing
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask=extended_attention_mask,
+                    position_bias=position_bias,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_extended_attention_mask,
+                    encoder_decoder_position_bias=encoder_decoder_position_bias,
+                    layer_head_mask=layer_head_mask,
+                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
+                    past_key_value=past_key_value,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                )
+
+            # layer_outputs is a tuple with: hidden-states, key-value-states,
+            # (self-attention position bias), (self-attention weights),
+            # (cross-attention position bias), (cross-attention weights)
+            if use_cache is False:
+                layer_outputs = layer_outputs[:1] + (
+                    None, ) + layer_outputs[1:]
+
+            hidden_states, present_key_value_state = layer_outputs[:2]
+
+            # We share the position biases between the layers - the first layer
+            # store them layer_outputs = hidden-states, key-value-states
+            # (self-attention position bias), (self-attention weights),
+            # (cross-attention position bias), (cross-attention weights)
+            position_bias = layer_outputs[2]
+            if self.is_decoder and encoder_hidden_states is not None:
+                encoder_decoder_position_bias = layer_outputs[
+                    4 if output_attentions else 3]
+            # append next layer key value states
+            if use_cache:
+                present_key_value_states = present_key_value_states + (
+                    present_key_value_state, )
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[3], )
+                if self.is_decoder:
+                    all_cross_attentions = all_cross_attentions + (
+                        layer_outputs[5], )
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and 'cuda:' + str(k) != self.last_device:
+                        hidden_states = hidden_states.to('cuda:' + str(k + 1))
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states, )
+
+        if not return_dict:
+            return tuple(v for v in [
+                hidden_states,
+                present_key_value_states,
+                all_hidden_states,
+                all_attentions,
+                all_cross_attentions,
+            ] if v is not None)
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=present_key_value_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+T5_START_DOCSTRING = r"""
+
+    The T5 model was proposed in [Exploring the Limits of Transfer Learning with
+    a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by
+    Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
+    Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder
+    transformer pre-trained in a text-to-text denoising generative setting.
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass
+    documentation for the generic methods the library implements for all its
+    model (such as downloading or saving, resizing the input embeddings, pruning
+    heads etc.)
+
+    This model is also a PyTorch
+    [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch
+    documentation for all matters related to general usage and behavior.
+
+    Parameters:
+        config ([`T5Config`]): Model configuration class with all the parameters
+        of the model.
+            Initializing with a config file does not load the weights associated
+            with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model
+            weights.
+"""
+
+T5_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. T5 is a model
+            with relative position embeddings so you should be able to pad the
+            inputs on both the right and the left.
+
+            Indices can be obtained using [`T5Tokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+            for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            To know more on how to prepare `input_ids` for pretraining take a
+            look at [T5 Training](./t5#training).
+        attention_mask (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask
+            values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size,
+        target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`T5Tokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+            for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            T5 uses the `pad_token_id` as the starting token for
+            `decoder_input_ids` generation. If `past_key_values` is used,
+            optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            To know more on how to prepare `decoder_input_ids` for pretraining
+            take a look at [T5 Training](./t5#training).
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,
+        target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in
+            `decoder_input_ids`. Causal mask will also be used by default.
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
+        num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules in the
+            encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or
+        `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules in the
+            decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or
+        `(num_layers, num_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules in
+                the decoder. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*,
+            `optional`: *attentions*) `last_hidden_state` of shape `(batch_size,
+            sequence_length, hidden_size)` is a sequence of hidden states at the
+            output of the last layer of the encoder. Used in the cross-attention
+            of the decoder.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length
+        `config.n_layers` with each tuple having 4 tensors of shape
+        `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention
+            blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only
+            the last `decoder_input_ids` (those that don't have their past key
+            value states given to this model) of shape `(batch_size, 1)` instead
+            of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to
+            directly pass an embedded representation. This is useful if you want
+            more control over how to convert `input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
+        target_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `decoder_input_ids` you can choose to
+            directly pass an embedded representation. If `past_key_values` is
+            used, optionally only the last `decoder_inputs_embeds` have to be
+            input (see `past_key_values`). This is useful if you want more
+            control over how to convert `decoder_input_ids` indices into
+            associated vectors than the model's internal embedding lookup
+            matrix.
+
+            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset,
+            `decoder_inputs_embeds` takes the value of `inputs_embeds`.
+
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned
+            and can be used to speed up decoding (see `past_key_values`).
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention
+            layers. See `attentions` under returned tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See
+            `hidden_states` under returned tensors for more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain
+            tuple.
+"""
+
+T5_ENCODER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. T5 is a model
+            with relative position embeddings so you should be able to pad the
+            inputs on both the right and the left.
+
+            Indices can be obtained using [`T5Tokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+            for details.
+
+            To know more on how to prepare `input_ids` for pretraining take a
+            look at [T5 Training](./t5#training).
+        attention_mask (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask
+            values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
+        num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask
+            values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to
+            directly pass an embedded representation. This is useful if you want
+            more control over how to convert `input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention
+            layers. See `attentions` under returned tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See
+            `hidden_states` under returned tensors for more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain
+            tuple.
+"""
+
+# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+__HEAD_MASK_WARNING_MSG = """
+The input argument `head_mask` was split into two arguments `head_mask` and
+`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,
+but this feature is deprecated and will be removed in future versions. If you do
+not want to use any `decoder_head_mask` now, please set `decoder_head_mask =
+torch.ones(num_layers, num_heads)`.
+"""
+
+
+@add_start_docstrings(
+    'The bare T5 Model transformer outputting raw hidden-states without any specific head on top.',
+    T5_START_DOCSTRING,
+)
+class T5Model(T5PreTrainedModel):
+    _keys_to_ignore_on_load_missing = [
+        r'encoder\.embed_tokens\.weight',
+        r'decoder\.embed_tokens\.weight',
+    ]
+    _keys_to_ignore_on_load_unexpected = [
+        r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight',
+    ]
+
+    def __init__(self, config: T5Config):
+        super().__init__(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)  # one embedding table handed to both stacks
+        # Encoder: private copy of the config with decoder mode and caching switched off
+        encoder_config = copy.deepcopy(config)
+        encoder_config.is_decoder = False
+        encoder_config.use_cache = False
+        encoder_config.is_encoder_decoder = False
+        self.encoder = T5Stack(encoder_config, self.shared)
+        # Decoder: private copy of the config in decoder mode with `num_decoder_layers` blocks
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.is_encoder_decoder = False
+        decoder_config.num_layers = config.num_decoder_layers
+        self.decoder = T5Stack(decoder_config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallelism stays off until parallelize() is called
+        self.model_parallel = False
+        self.device_map = None
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        self.device_map = (  # default map is sized from the encoder's block count
+            get_device_map(
+                len(self.encoder.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.encoder.block))
+        self.encoder.parallelize(self.device_map)
+        self.decoder.parallelize(self.device_map)  # NOTE(review): same map reused; presumably assumes equal layer counts
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.encoder.deparallelize()  # each stack first resets its own blocks and device bookkeeping
+        self.decoder.deparallelize()
+        self.encoder = self.encoder.to('cpu')  # then the whole stacks are moved to the CPU
+        self.decoder = self.decoder.to('cpu')
+        self.model_parallel = False
+        self.device_map = None
+        torch.cuda.empty_cache()  # ask PyTorch to release its unused cached CUDA memory
+
+    def get_input_embeddings(self):
+        return self.shared  # the embedding table passed to both encoder and decoder
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings  # replace the shared table ...
+        self.encoder.set_input_embeddings(new_embeddings)  # ... and point both stacks at the new module
+        self.decoder.set_input_embeddings(new_embeddings)
+
+    def get_encoder(self):
+        return self.encoder  # the T5Stack built with is_decoder=False
+
+    def get_decoder(self):
+        return self.decoder  # the T5Stack built with is_decoder=True
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of
+        heads to prune in this layer} See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():  # NOTE(review): T5Stack defines `block`, not `layer` -- confirm
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        decoder_head_mask: Optional[torch.FloatTensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python >>> from transformers import T5Tokenizer, T5Model
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5Model.from_pretrained("t5-small")
+
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        >>> ).input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+
+        >>> # forward pass
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+        if head_mask is not None and decoder_head_mask is None:
+            if self.config.num_layers == self.config.num_decoder_layers:
+                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
+                decoder_head_mask = head_mask
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1]
+                if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2]
+                if len(encoder_outputs) > 2 else None,
+            )
+
+        hidden_states = encoder_outputs[0]
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+            hidden_states = hidden_states.to(self.decoder.first_device)
+            if decoder_input_ids is not None:
+                decoder_input_ids = decoder_input_ids.to(
+                    self.decoder.first_device)
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(self.decoder.first_device)
+            if decoder_attention_mask is not None:
+                decoder_attention_mask = decoder_attention_mask.to(
+                    self.decoder.first_device)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings("""T5 Model with a `language modeling` head on top.""",
+                      T5_START_DOCSTRING)
+class T5ForConditionalGeneration(T5PreTrainedModel):
+    _keys_to_ignore_on_load_missing = [
+        r'encoder\.embed_tokens\.weight',
+        r'decoder\.embed_tokens\.weight',
+        r'lm_head\.weight',
+    ]
+    _keys_to_ignore_on_load_unexpected = [
+        r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight',
+    ]
+
+    def __init__(self, config: T5Config):
+        super().__init__(config)
+        self.model_dim = config.d_model
+
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.is_decoder = False
+        encoder_config.use_cache = False
+        encoder_config.is_encoder_decoder = False
+        self.encoder = T5Stack(encoder_config, self.shared)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.is_encoder_decoder = False
+        decoder_config.num_layers = config.num_decoder_layers
+        self.decoder = T5Stack(decoder_config, self.shared)
+
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        self.device_map = (
+            get_device_map(
+                len(self.encoder.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.encoder.block))
+        self.encoder.parallelize(self.device_map)
+        self.decoder.parallelize(self.device_map)
+        self.lm_head = self.lm_head.to(self.decoder.first_device)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.encoder.deparallelize()
+        self.decoder.deparallelize()
+        self.encoder = self.encoder.to('cpu')
+        self.decoder = self.decoder.to('cpu')
+        self.lm_head = self.lm_head.to('cpu')
+        self.model_parallel = False
+        self.device_map = None
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)
+        self.decoder.set_input_embeddings(new_embeddings)
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_decoder(self):
+        return self.decoder
+
+    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        decoder_head_mask: Optional[torch.FloatTensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the language modeling (cross-entropy) loss.
+            Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All
+            labels set to `-100` are ignored (masked), the loss is only computed
+            for labels in `[0, ..., config.vocab_size - 1]`
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        >>> # training
+        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
+        >>> outputs = model(input_ids=input_ids, labels=labels)
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+
+        >>> # inference
+        >>> input_ids = tokenizer(
+        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
+        >>> ).input_ids  # Batch size 1
+        >>> outputs = model.generate(input_ids)
+        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+        >>> # studies have shown that owning a dog is good for you.
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+        if head_mask is not None and decoder_head_mask is None:
+            if self.config.num_layers == self.config.num_decoder_layers:
+                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
+                decoder_head_mask = head_mask
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            # Convert encoder inputs in embeddings if needed
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1]
+                if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2]
+                if len(encoder_outputs) > 2 else None,
+            )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+            hidden_states = hidden_states.to(self.decoder.first_device)
+            if decoder_input_ids is not None:
+                decoder_input_ids = decoder_input_ids.to(
+                    self.decoder.first_device)
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(self.decoder.first_device)
+            if decoder_attention_mask is not None:
+                decoder_attention_mask = decoder_attention_mask.to(
+                    self.decoder.first_device)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = decoder_outputs[0]
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.encoder.first_device)
+            self.lm_head = self.lm_head.to(self.encoder.first_device)
+            sequence_output = sequence_output.to(self.lm_head.weight.device)
+
+        if self.config.tie_word_embeddings:
+            # Rescale output before projecting on vocab See
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+            sequence_output = sequence_output * (self.model_dim**-0.5)
+
+        lm_logits = self.lm_head(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-100)
+            loss = loss_fct(
+                lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
+            # TODO(thom): Add z_loss
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
+
+        if not return_dict:
+            output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs
+            return ((loss, ) + output) if loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      past=None,
+                                      attention_mask=None,
+                                      head_mask=None,
+                                      decoder_head_mask=None,
+                                      cross_attn_head_mask=None,
+                                      use_cache=None,
+                                      encoder_outputs=None,
+                                      **kwargs):
+
+        # cut decoder_input_ids if past is used
+        if past is not None:
+            input_ids = input_ids[:, -1:]
+
+        return {
+            'decoder_input_ids': input_ids,
+            'past_key_values': past,
+            'encoder_outputs': encoder_outputs,
+            'attention_mask': attention_mask,
+            'head_mask': head_mask,
+            'decoder_head_mask': decoder_head_mask,
+            'cross_attn_head_mask': cross_attn_head_mask,
+            'use_cache': use_cache,
+        }
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return self._shift_right(labels)
+
+    def _reorder_cache(self, past, beam_idx):
+        """Reorder cached decoder states to follow the selected beams.
+
+        Args:
+            past: per-layer tuples of cached state tensors, or None.
+            beam_idx: indices selecting entries along dim 0 of each tensor.
+
+        Returns:
+            `past` unchanged when it is None, otherwise a tuple of per-layer
+            tuples whose tensors were re-indexed along dim 0.
+        """
+        if past is None:
+            # no decoder cache in the output, hence nothing to reorder
+            logger.warning(
+                'You might want to consider setting `use_cache=True` to speed up decoding'
+            )
+            return past
+
+        reordered = ()
+        for cached_layer in past:
+            new_layer = tuple(
+                state.index_select(0, beam_idx.to(state.device))
+                for state in cached_layer)
+            assert new_layer[0].shape == cached_layer[0].shape
+            assert len(new_layer) == len(cached_layer)
+            reordered += (new_layer, )
+        return reordered
+
+
+@add_start_docstrings(
+    "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
+    T5_START_DOCSTRING,
+)
+class T5EncoderModel(T5PreTrainedModel):
+    # Same attribute name as on T5ForConditionalGeneration; the encoder is
+    # built around `self.shared`, so this key may be missing in checkpoints.
+    _keys_to_ignore_on_load_missing = [r'encoder\.embed_tokens\.weight']
+
+    def __init__(self, config: T5Config):
+        super().__init__(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.use_cache = False
+        encoder_config.is_encoder_decoder = False
+        self.encoder = T5Stack(encoder_config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        self.device_map = (
+            get_device_map(
+                len(self.encoder.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.encoder.block))
+        self.encoder.parallelize(self.device_map)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.encoder.deparallelize()
+        self.encoder = self.encoder.to('cpu')
+        self.model_parallel = False
+        self.device_map = None
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)
+
+    def get_encoder(self):
+        return self.encoder
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes self-attention heads. heads_to_prune: dict of {layer_num: list
+        of heads to prune in that layer}; layers are `self.encoder.block`.
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5EncoderModel
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5EncoderModel.from_pretrained("t5-small")
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        >>> ).input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        return encoder_outputs
diff --git a/modelscope/models/nlp/T5/t5_for_text_generation.py b/modelscope/models/nlp/T5/t5_for_text_generation.py
new file mode 100644
index 00000000..27f077d8
--- /dev/null
+++ b/modelscope/models/nlp/T5/t5_for_text_generation.py
@@ -0,0 +1,56 @@
+from typing import Optional, Tuple
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from .modeling_t5 import T5Config
+from .modeling_t5 import T5ForConditionalGeneration as T5ForGeneration
+
+
+@MODELS.register_module(
+    group_key=Tasks.text2text_generation,
+    module_name=Models.T5,
+)
+class T5ForConditionalGeneration(TorchModel):
+
+    def __init__(self, model_dir=None, *args, **kwargs):
+        """Initialize the text2text generation model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+            *args, **kwargs: passed through to the `TorchModel` constructor.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.model = T5ForGeneration.from_pretrained(model_dir)
+        self.generate = self.model.generate
+        self.config = self.model.config
+
+    def forward(self,
+                input_ids: Optional[torch.LongTensor] = None,
+                attention_mask: Optional[torch.FloatTensor] = None,
+                decoder_input_ids: Optional[torch.LongTensor] = None,
+                decoder_attention_mask: Optional[torch.BoolTensor] = None,
+                head_mask: Optional[torch.FloatTensor] = None,
+                decoder_head_mask: Optional[torch.FloatTensor] = None,
+                cross_attn_head_mask: Optional[torch.Tensor] = None,
+                encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+                past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+                inputs_embeds: Optional[torch.FloatTensor] = None,
+                decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+                labels: Optional[torch.LongTensor] = None,
+                use_cache: Optional[bool] = None,
+                output_attentions: Optional[bool] = None,
+                output_hidden_states: Optional[bool] = None,
+                return_dict: Optional[bool] = None,
+                **kwargs):
+        # `self.model.forward` is already bound, so `self` must not be passed.
+        return self.model.forward(
+            input_ids, attention_mask, decoder_input_ids,
+            decoder_attention_mask, head_mask, decoder_head_mask,
+            cross_attn_head_mask, encoder_outputs, past_key_values,
+            inputs_embeds, decoder_inputs_embeds, labels, use_cache,
+            output_attentions, output_hidden_states, return_dict, **kwargs)
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 443cb214..152a32dc 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
     from .token_classification import SbertForTokenClassification
     from .sentence_embedding import SentenceEmbedding
     from .passage_ranking import PassageRanking
-
+    from .T5 import T5ForConditionalGeneration
 else:
     _import_structure = {
         'backbones': ['SbertModel'],
@@ -68,6 +68,7 @@ else:
         'table_question_answering': ['TableQuestionAnswering'],
         'sentence_embedding': ['SentenceEmbedding'],
         'passage_ranking': ['PassageRanking'],
+        'T5': ['T5ForConditionalGeneration'],
     }
 
     import sys
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index b3eb9ad8..a80cbf33 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -390,12 +390,19 @@ TASK_OUTPUTS = {
     Tasks.text_error_correction: [OutputKeys.OUTPUT],
     Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],
     Tasks.passage_ranking: [OutputKeys.SCORES],
+
     # text generation result for single sample
     # {
     #   "text": "this is the text generated by a model."
     # }
     Tasks.text_generation: [OutputKeys.TEXT],
 
+    # text2text generation result for single sample
+    # {
+    #   "text": "北京"
+    # }
+    Tasks.text2text_generation: [OutputKeys.TEXT],
+
     # fill mask result for single sample
     # {
     #   "text": "this is the text which masks filled by model."
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index b5c53f82..a8edc21a 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -12,7 +12,7 @@ if TYPE_CHECKING:
     from .document_segmentation_pipeline import DocumentSegmentationPipeline
     from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline
     from .fill_mask_pipeline import FillMaskPipeline
-    from .fill_mask_ponet_pipeline import FillMaskPoNetPreprocessor
+    from .fill_mask_ponet_pipeline import FillMaskPonetPipeline
     from .information_extraction_pipeline import InformationExtractionPipeline
     from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline
     from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline
@@ -22,6 +22,7 @@ if TYPE_CHECKING:
     from .text_classification_pipeline import TextClassificationPipeline
     from .text_error_correction_pipeline import TextErrorCorrectionPipeline
     from .text_generation_pipeline import TextGenerationPipeline
+    from .text2text_generation_pipeline import Text2TextGenerationPipeline
     from .token_classification_pipeline import TokenClassificationPipeline
     from .translation_pipeline import TranslationPipeline
     from .word_segmentation_pipeline import WordSegmentationPipeline
@@ -54,6 +55,7 @@ else:
         'text_classification_pipeline': ['TextClassificationPipeline'],
         'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'],
         'text_generation_pipeline': ['TextGenerationPipeline'],
+        'text2text_generation_pipeline': ['Text2TextGenerationPipeline'],
         'token_classification_pipeline': ['TokenClassificationPipeline'],
         'translation_pipeline': ['TranslationPipeline'],
         'word_segmentation_pipeline': ['WordSegmentationPipeline'],
diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py
new file mode 100644
index 00000000..9ccd00f4
--- /dev/null
+++ b/modelscope/pipelines/nlp/text2text_generation_pipeline.py
@@ -0,0 +1,87 @@
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.base import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import Text2TextGenerationPreprocessor
+from modelscope.utils.constant import Tasks
+
+__all__ = ['Text2TextGenerationPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.text2text_generation, module_name=Pipelines.text2text_generation)
+class Text2TextGenerationPipeline(Pipeline):
+
+    def __init__(
+            self,
+            model: Union[Model, str],
+            preprocessor: Optional[Text2TextGenerationPreprocessor] = None,
+            first_sequence='sentence',
+            **kwargs):
+        """Use `model` and `preprocessor` to create a text to text generation pipeline for prediction.
+
+        Args:
+            model (str or Model): Supply either a local model dir which supports the text2text generation task,
+            or a model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits
+            the model if supplied.
+            first_sequence: The key to read the first sentence in.
+            NOTE(review): this value is not forwarded to the default preprocessor below - confirm intended.
+            sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value.
+
+            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
+            param will have no effect.
+
+            Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline(task='text2text-generation',
+            >>>    model='damo/t5-cn-base-test')
+            >>> sentence1 = '中国的首都位于<extra_id_0>。'
+            >>> print(pipeline_ins(sentence1))
+            >>> # Or use the dict input:
+            >>> print(pipeline_ins({'sentence': sentence1}))
+
+            To view other examples please check tests/pipelines/test_text2text_generation.py.
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = Text2TextGenerationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        self.tokenizer = preprocessor.tokenizer
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        """Call `model.generate` under `torch.no_grad`; length limits default to the model config."""
+        forward_params['min_length'] = forward_params.get(
+            'min_length', self.model.config.min_length)
+        forward_params['max_length'] = forward_params.get(
+            'max_length', self.model.config.max_length)
+
+        with torch.no_grad():
+            output_ids = self.model.generate(**inputs, **forward_params)
+            return {'output_ids': output_ids}
+
+    def postprocess(self, inputs: Dict[str, Tensor],
+                    **postprocess_params) -> Dict[str, str]:
+        """Decode the first generated sequence into text.
+
+        Args:
+            inputs (Dict[str, Tensor]): forward output; `inputs['output_ids'][0]` is decoded.
+
+        Returns:
+            Dict[str, str]: the decoded text under `OutputKeys.TEXT`, special tokens skipped.
+        """
+        output = self.tokenizer.decode(
+            inputs['output_ids'][0],
+            skip_special_tokens=True,
+        )
+        return {OutputKeys.TEXT: output}
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index ba03a35e..e37b3324 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
         TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
         SequenceLabelingPreprocessor, RelationExtractionPreprocessor,
         DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor,
-        PassageRankingPreprocessor,
+        PassageRankingPreprocessor, Text2TextGenerationPreprocessor,
         WordSegmentationBlankSetToLabelPreprocessor)
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
@@ -57,6 +57,7 @@ else:
             'TextErrorCorrectionPreprocessor',
             'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
             'RelationExtractionPreprocessor',
+            'Text2TextGenerationPreprocessor',
             'WordSegmentationBlankSetToLabelPreprocessor',
             'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
         ],
diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py
index eee5e80f..f305df27 100644
--- a/modelscope/preprocessors/nlp/__init__.py
+++ b/modelscope/preprocessors/nlp/__init__.py
@@ -9,6 +9,7 @@ if TYPE_CHECKING:
         Tokenize, SequenceClassificationPreprocessor,
         TextGenerationPreprocessor, TokenClassificationPreprocessor,
         SingleSentenceClassificationPreprocessor,
+        Text2TextGenerationPreprocessor,
         PairSentenceClassificationPreprocessor, FillMaskPreprocessor,
         ZeroShotClassificationPreprocessor, NERPreprocessor,
         FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor,
@@ -27,6 +28,7 @@ else:
             'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
             'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
             'RelationExtractionPreprocessor',
+            'Text2TextGenerationPreprocessor',
             'WordSegmentationBlankSetToLabelPreprocessor',
             'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
         ],
diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py
index 0a2495af..d294f517 100644
--- a/modelscope/preprocessors/nlp/nlp_base.py
+++ b/modelscope/preprocessors/nlp/nlp_base.py
@@ -26,6 +26,7 @@ __all__ = [
     'Tokenize', 'SequenceClassificationPreprocessor',
     'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
     'PairSentenceClassificationPreprocessor',
+    'Text2TextGenerationPreprocessor',
     'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
     'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
     'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
@@ -442,6 +443,40 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase):
         return features
 
 
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor)
+class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in text2text generation; it tokenizes
+    the first text and removes `token_type_ids` from the tokenizer output."""
+
+    def __init__(self,
+                 model_dir: str,
+                 tokenizer=None,
+                 mode=ModeKeys.INFERENCE,
+                 **kwargs):
+        self.tokenizer = self.build_tokenizer(
+            model_dir) if tokenizer is None else tokenizer
+        kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate')
+        kwargs['padding'] = kwargs.get('padding', False)
+        kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
+                                                     False)
+        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
+        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
+
+    def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]:
+        text_a, _, _ = self.parse_text_and_label(data)
+
+        inputs = self.tokenizer(
+            text_a,
+            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
+            **self.tokenize_kwargs)
+
+        # Tokenizers may emit this key, but it is not a valid `generate` kwarg
+        if 'token_type_ids' in inputs:
+            del inputs['token_type_ids']
+        return inputs
+
+
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.text_gen_tokenizer)
 class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index d6b0da40..4c5d2f41 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -97,6 +97,7 @@ class NLPTasks(object):
     token_classification = 'token-classification'
     conversational = 'conversational'
     text_generation = 'text-generation'
+    text2text_generation = 'text2text-generation'
     task_oriented_conversation = 'task-oriented-conversation'
     dialog_intent_prediction = 'dialog-intent-prediction'
     dialog_state_tracking = 'dialog-state-tracking'
diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py
new file mode 100644
index 00000000..04cecf93
--- /dev/null
+++ b/tests/pipelines/test_text2text_generation.py
@@ -0,0 +1,61 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import T5ForConditionalGeneration
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import Text2TextGenerationPipeline
+from modelscope.preprocessors import Text2TextGenerationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/t5-cn-base-test'
+        self.input = '中国的首都位于<extra_id_0>。'  # "The capital of China is located in <extra_id_0>."
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_T5(self):
+        cache_path = snapshot_download(self.model_id)
+        model = T5ForConditionalGeneration(cache_path)
+        preprocessor = Text2TextGenerationPreprocessor(cache_path)
+        pipeline1 = Text2TextGenerationPipeline(model, preprocessor)  # pipeline class built directly
+        pipeline2 = pipeline(  # same model/preprocessor through the pipeline() factory
+            Tasks.text2text_generation, model=model, preprocessor=preprocessor)
+        print(
+            f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}'
+        )
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline_with_model_instance(self):
+        model = Model.from_pretrained(self.model_id)
+        preprocessor = Text2TextGenerationPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.text2text_generation,
+            model=model,
+            preprocessor=preprocessor)
+        print(pipeline_ins(self.input))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline_with_model_id(self):
+        pipeline_ins = pipeline(  # only the model id is given; no explicit preprocessor
+            task=Tasks.text2text_generation, model=self.model_id)
+        print(pipeline_ins(self.input))
+
+    @unittest.skip(
+        'only for test cases, there is no default official model yet')
+    def test_run_pipeline_without_model_id(self):
+        pipeline_ins = pipeline(task=Tasks.text2text_generation)  # no model argument at all
+        print(pipeline_ins(self.input))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()

From 4dbdc45963a769d43afe2d75c1ebc7964c359c9d Mon Sep 17 00:00:00 2001
From: "hanyuan.chy" <hanyuan.chy@alibaba-inc.com>
Date: Mon, 26 Sep 2022 13:23:32 +0800
Subject: [PATCH 140/175] test(data): add test data         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10246518

---
 data/test/videos/Walking.54138969.mp4 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/test/videos/Walking.54138969.mp4 b/data/test/videos/Walking.54138969.mp4
index 1716695f..d4355290 100644
--- a/data/test/videos/Walking.54138969.mp4
+++ b/data/test/videos/Walking.54138969.mp4
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b8f50a0537bfe7e082c5ad91b2b7ece61a0adbeb7489988e553909276bf920c
-size 44217644
+oid sha256:7663f9a32ea57086bf66c4b9e9ebe0fd418986c67716c7be02ca917e72ddc0ba
+size 8155895

From b876839d51b81a14e6caaba87d6fb0c9f646a0c8 Mon Sep 17 00:00:00 2001
From: "shuying.shu" <shuying.shu@alibaba-inc.com>
Date: Mon, 26 Sep 2022 14:03:35 +0800
Subject: [PATCH 141/175] [to #42322933]adjust output form

adjust output form for movie scene segmentation demo

 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10244194
---
 .../models/cv/movie_scene_segmentation/model.py     |  4 ++--
 .../cv/movie_scene_segmentation/utils/save_op.py    | 13 ++++++-------
 modelscope/outputs.py                               | 11 +++++------
 .../cv/movie_scene_segmentation_pipeline.py         |  4 ++--
 4 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py
index 676b5ac1..1232d427 100644
--- a/modelscope/models/cv/movie_scene_segmentation/model.py
+++ b/modelscope/models/cv/movie_scene_segmentation/model.py
@@ -162,11 +162,11 @@ class MovieSceneSegmentationModel(TorchModel):
         thres = self.cfg.pipeline.save_threshold
 
         anno_dict = get_pred_boundary(pred_dict, thres)
-        scene_dict, scene_list = pred2scene(self.shot2keyf, anno_dict)
+        scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict)
         if self.cfg.pipeline.save_split_scene:
             re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
             print(f'Split scene video saved to {re_dir}')
-        return len(scene_list), scene_dict
+        return len(scene_list), scene_dict_lst
 
     def preprocess(self, inputs):
         logger.info('Begin shot detect......')
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
index cf26d21a..6361c056 100644
--- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
@@ -21,16 +21,15 @@ def get_pred_boundary(pred_dict, threshold=0.5):
 def pred2scene(shot2keyf, anno_dict):
     scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)
 
-    scene_dict = {}
+    scene_dict_lst = []
     assert len(scene_list) == len(pair_list)
     for scene_ind, scene_item in enumerate(scene_list):
-        scene_dict.update(
-            {scene_ind: {
-                'shot': pair_list[scene_ind],
-                'frame': scene_item
-            }})
+        scene_dict_lst.append({
+            'shot': pair_list[scene_ind],
+            'frame': scene_item
+        })
 
-    return scene_dict, scene_list
+    return scene_dict_lst, scene_list
 
 
 def scene2video(source_movie_fn, scene_list, thres):
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index a80cbf33..052d4f33 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -38,7 +38,7 @@ class OutputKeys(object):
     HISTORY = 'history'
     TIMESTAMPS = 'timestamps'
     SPLIT_VIDEO_NUM = 'split_video_num'
-    SPLIT_META_DICT = 'split_meta_dict'
+    SPLIT_META_LIST = 'split_meta_list'
 
 
 TASK_OUTPUTS = {
@@ -293,18 +293,17 @@ TASK_OUTPUTS = {
     # movide scene segmentation result for a single video
     # {
     #        "split_video_num":3,
-    #        "split_meta_dict":
-    #        {
-    #           scene_id:
+    #        "split_meta_list":
+    #        [
     #           {
     #               "shot": [0,1,2],
     #               "frame": [start_frame, end_frame]
     #           }
-    #        }
+    #        ]
     #
     # }
     Tasks.movie_scene_segmentation:
-    [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_DICT],
+    [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST],
 
     # ============ nlp tasks ===================
 
diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
index b5acf17a..6704e4c0 100644
--- a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
@@ -60,9 +60,9 @@ class MovieSceneSegmentationPipeline(Pipeline):
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         data = {'input_video_pth': self.input_video_pth, 'feat': inputs}
-        video_num, meta_dict = self.model.postprocess(data)
+        video_num, meta_lst = self.model.postprocess(data)
         result = {
             OutputKeys.SPLIT_VIDEO_NUM: video_num,
-            OutputKeys.SPLIT_META_DICT: meta_dict
+            OutputKeys.SPLIT_META_LIST: meta_lst
         }
         return result

From bd4127bc27120f460f90f5f75832d8d3830e5b06 Mon Sep 17 00:00:00 2001
From: "tianchu.gtc" <tianchu.gtc@alibaba-inc.com>
Date: Mon, 26 Sep 2022 15:49:35 +0800
Subject: [PATCH 142/175] =?UTF-8?q?[to=20#42322933]segformer=20=E6=8E=A5?=
 =?UTF-8?q?=E5=85=A5demo=E6=8E=A5=E5=8F=A3=E6=9B=B4=E6=94=B9=20=20=20=20?=
 =?UTF-8?q?=20=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/Ma?=
 =?UTF-8?q?aS-lib/codereview/10253628?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../easycv_pipelines/segmentation_pipeline.py | 24 ++++++++++++++
 .../test_segmentation_pipeline.py             | 32 ++++++++++---------
 2 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py
index 2182e3b3..bd09fc9b 100644
--- a/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py
@@ -1,5 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any
+
+import numpy as np
+
 from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.utils.constant import Tasks
 from .base import EasyCVPipeline
@@ -21,3 +26,22 @@ class EasyCVSegmentationPipeline(EasyCVPipeline):
             model_file_pattern=model_file_pattern,
             *args,
             **kwargs)
+
+    def __call__(self, inputs) -> Any:
+        """Predict on `inputs`; return a mask, label and score per class."""
+        outputs = self.predict_op(inputs)
+        semantic_result = outputs[0]['seg_pred']
+        # Class ids present in the prediction, largest id first.
+        ids = np.unique(semantic_result)[::-1]
+        legal_indices = ids != len(self.predict_op.CLASSES)  # for VOID label
+        ids = ids[legal_indices]
+        segms = (semantic_result[None] == ids[:, None, None])
+        # `np.int` was removed in NumPy 1.24; builtin `int` is its equivalent.
+        masks = [it.astype(int) for it in segms]
+        labels_txt = np.array(self.predict_op.CLASSES)[ids].tolist()
+
+        results = {
+            OutputKeys.MASKS: masks,
+            OutputKeys.LABELS: labels_txt,
+            OutputKeys.SCORES: [0.999] * len(labels_txt)  # constant score
+        }
+        return results
diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
index 80ab36a6..5f6dac4b 100644
--- a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
+++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
@@ -2,30 +2,34 @@
 import unittest
 from distutils.version import LooseVersion
 
+import cv2
 import easycv
 import numpy as np
 from PIL import Image
 
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import semantic_seg_masks_to_image
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class EasyCVSegmentationPipelineTest(unittest.TestCase):
-
+class EasyCVSegmentationPipelineTest(unittest.TestCase,
+                                     DemoCompatibilityCheck):
     img_path = 'data/test/images/image_segmentation.jpg'
 
-    def _internal_test_(self, model_id):
-        img = np.asarray(Image.open(self.img_path))
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k'
 
+    def _internal_test_(self, model_id):
         semantic_seg = pipeline(task=Tasks.image_segmentation, model=model_id)
         outputs = semantic_seg(self.img_path)
 
-        self.assertEqual(len(outputs), 1)
-
-        results = outputs[0]
-        self.assertListEqual(
-            list(img.shape)[:2], list(results['seg_pred'].shape))
+        draw_img = semantic_seg_masks_to_image(outputs[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test ' + model_id + ' DONE')
 
     def _internal_test_batch_(self, model_id, num_samples=2, batch_size=2):
         # TODO: support in the future
@@ -49,37 +53,35 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase):
     def test_segformer_b0(self):
         model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k'
         self._internal_test_(model_id)
-        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b1(self):
         model_id = 'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k'
         self._internal_test_(model_id)
-        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b2(self):
         model_id = 'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k'
         self._internal_test_(model_id)
-        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b3(self):
         model_id = 'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k'
         self._internal_test_(model_id)
-        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b4(self):
         model_id = 'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k'
         self._internal_test_(model_id)
-        self._internal_test_batch_(model_id)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_segformer_b5(self):
         model_id = 'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k'
         self._internal_test_(model_id)
-        self._internal_test_batch_(model_id)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
 
 
 if __name__ == '__main__':

From f844f73b03ed5c47ef6e32ec9359c8984af8a02a Mon Sep 17 00:00:00 2001
From: "leyuan.hjy" <leyuan.hjy@alibaba-inc.com>
Date: Mon, 26 Sep 2022 15:52:03 +0800
Subject: [PATCH 143/175] =?UTF-8?q?[to=20#42322933]=E4=BF=AE=E5=A4=8Dnano?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=88=9D=E5=A7=8B=E5=8C=96/=E5=A2=9E?=
 =?UTF-8?q?=E5=8A=A0=E6=96=87=E4=BB=B6copyright=E4=BF=A1=E6=81=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

修复nano模型初始化/增加文件copyright信息
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10247456
---
 .../cv/realtime_object_detection/realtime_detector.py      | 7 ++++++-
 .../yolox/exp/default/yolox_nano.py                        | 3 ++-
 .../pipelines/cv/realtime_object_detection_pipeline.py     | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/modelscope/models/cv/realtime_object_detection/realtime_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_detector.py
index b147f769..2b4b3f8c 100644
--- a/modelscope/models/cv/realtime_object_detection/realtime_detector.py
+++ b/modelscope/models/cv/realtime_object_detection/realtime_detector.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import argparse
 import logging as logger
 import os
@@ -48,6 +49,7 @@ class RealtimeDetector(TorchModel):
         self.nmsthre = self.exp.nmsthre
         self.test_size = self.exp.test_size
         self.preproc = ValTransform(legacy=False)
+        self.label_mapping = self.config['labels']
 
     def inference(self, img):
         with torch.no_grad():
@@ -81,5 +83,8 @@ class RealtimeDetector(TorchModel):
             bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio
             scores = outputs[0][:, 5].cpu().numpy()
             labels = outputs[0][:, 6].cpu().int().numpy()
+            pred_label_names = []
+            for lab in labels:
+                pred_label_names.append(self.label_mapping[lab])
 
-        return bboxes, scores, labels
+        return bboxes, scores, pred_label_names
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
index 330eef16..7bada485 100644
--- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
@@ -42,5 +42,6 @@ class YoloXNanoExp(YoloXExp):
                 act=self.act,
                 depthwise=True)
             self.model = YOLOX(backbone, head)
-
+        self.model.apply(init_yolo)
+        self.model.head.initialize_biases(1e-2)
         return self.model
diff --git a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
index 629720d1..9f558f88 100644
--- a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
+++ b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict, List, Union
 

From 65cce5b9976db9873ceb3fa1687903546f679e0d Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Mon, 26 Sep 2022 16:12:17 +0800
Subject: [PATCH 144/175]  [to #44902165] bump version to 0.4.5

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index 9a8e054a..68eb9b68 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.4.4'
+__version__ = '0.4.5'

From c498d88d48a8c8cdd85c963322795914dabc9f42 Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Mon, 26 Sep 2022 17:38:13 +0800
Subject: [PATCH 145/175] [to #42322933] add license declaration

1. add license declaration
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10216802
---
 .../metrics/sequence_classification_metric.py |  2 ++
 modelscope/metrics/text_generation_metric.py  |  2 ++
 .../metrics/token_classification_metric.py    |  2 ++
 .../models/multi_modal/mplug/clip/__init__.py |  2 ++
 .../models/multi_modal/mplug/predictor.py     | 16 +++++++++++++
 .../models/multi_modal/mplug_for_all_tasks.py |  2 ++
 modelscope/models/nlp/backbones/structbert.py |  1 +
 .../nlp/bart_for_text_error_correction.py     |  1 +
 .../nlp/bert_for_sequence_classification.py   |  1 +
 .../models/nlp/csanmt_for_translation.py      |  3 +++
 .../nlp/gpt3/gpt3_for_text_generation.py      |  1 +
 modelscope/models/nlp/gpt3/modeling_gpt3.py   |  1 +
 .../nlp/heads/infromation_extraction_head.py  |  5 +---
 .../nlp/heads/sequence_classification_head.py |  1 +
 .../nlp/heads/token_classification_head.py    |  1 +
 .../models/nlp/heads/torch_pretrain_head.py   |  1 +
 modelscope/models/nlp/masked_language.py      |  3 +--
 .../nlp/nncrf_for_named_entity_recognition.py |  6 +++--
 .../models/nlp/palm_v2/modeling_palm.py       | 16 +++++++++++++
 .../nlp/palm_v2/palm_for_text_generation.py   |  1 +
 modelscope/models/nlp/passage_ranking.py      |  2 ++
 modelscope/models/nlp/sentence_embedding.py   |  4 ++--
 .../models/nlp/sequence_classification.py     |  2 ++
 .../nlp/task_models/information_extraction.py |  5 +---
 .../task_models/sequence_classification.py    |  1 +
 .../models/nlp/task_models/task_model.py      |  1 +
 .../nlp/task_models/token_classification.py   |  1 +
 modelscope/models/nlp/token_classification.py |  2 ++
 .../nlp/dialog_state_tracking_pipeline.py     |  2 ++
 .../nlp/distributed_plug_pipeline.py          |  2 ++
 .../nlp/faq_question_answering_pipeline.py    |  2 ++
 .../pipelines/nlp/fill_mask_pipeline.py       |  2 ++
 .../nlp/information_extraction_pipeline.py    |  5 ++--
 .../nlp/named_entity_recognition_pipeline.py  |  2 ++
 .../pair_sentence_classification_pipeline.py  |  2 ++
 .../pipelines/nlp/passage_ranking_pipeline.py |  2 ++
 .../nlp/sentence_embedding_pipeline.py        |  2 ++
 .../sequence_classification_pipeline_base.py  |  2 ++
 ...single_sentence_classification_pipeline.py |  2 ++
 .../nlp/text_error_correction_pipeline.py     |  2 ++
 .../pipelines/nlp/text_generation_pipeline.py |  2 ++
 .../nlp/token_classification_pipeline.py      |  2 ++
 .../pipelines/nlp/translation_pipeline.py     |  2 ++
 .../nlp/word_segmentation_pipeline.py         |  2 ++
 .../nlp/zero_shot_classification_pipeline.py  |  2 ++
 modelscope/preprocessors/__init__.py          |  3 ++-
 modelscope/preprocessors/nlp/__init__.py      |  1 +
 modelscope/preprocessors/nlp/nlp_base.py      | 24 ++++++++++++-------
 .../nlp/csanmt_translation_trainer.py         |  2 ++
 .../trainers/nlp/passage_ranking_trainer.py   |  2 ++
 .../nlp/sequence_classification_trainer.py    |  2 ++
 .../nlp/space/dialog_intent_trainer.py        |  2 ++
 .../nlp/space/dialog_modeling_trainer.py      |  2 ++
 .../nlp/space/metrics/metrics_tracker.py      |  4 +---
 modelscope/trainers/nlp_trainer.py            |  2 ++
 55 files changed, 139 insertions(+), 28 deletions(-)

diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py
index d795d8a2..51a829ef 100644
--- a/modelscope/metrics/sequence_classification_metric.py
+++ b/modelscope/metrics/sequence_classification_metric.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Dict
 
 import numpy as np
diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py
index 6bdcbc58..f154281d 100644
--- a/modelscope/metrics/text_generation_metric.py
+++ b/modelscope/metrics/text_generation_metric.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Dict
 
 from modelscope.metainfo import Metrics
diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py
index 53d13b6a..05b72170 100644
--- a/modelscope/metrics/token_classification_metric.py
+++ b/modelscope/metrics/token_classification_metric.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import importlib
 from typing import Dict, List, Optional, Union
 
diff --git a/modelscope/models/multi_modal/mplug/clip/__init__.py b/modelscope/models/multi_modal/mplug/clip/__init__.py
index 05826f46..e6007a04 100644
--- a/modelscope/models/multi_modal/mplug/clip/__init__.py
+++ b/modelscope/models/multi_modal/mplug/clip/__init__.py
@@ -1 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .clip import load_from_config
diff --git a/modelscope/models/multi_modal/mplug/predictor.py b/modelscope/models/multi_modal/mplug/predictor.py
index c976baa1..6375d1d7 100755
--- a/modelscope/models/multi_modal/mplug/predictor.py
+++ b/modelscope/models/multi_modal/mplug/predictor.py
@@ -1,3 +1,19 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 
 import torch
diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py
index d61fea10..64a7dd7b 100644
--- a/modelscope/models/multi_modal/mplug_for_all_tasks.py
+++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Dict, List
 
diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py
index f47900c3..74735520 100644
--- a/modelscope/models/nlp/backbones/structbert.py
+++ b/modelscope/models/nlp/backbones/structbert.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import BACKBONES
diff --git a/modelscope/models/nlp/bart_for_text_error_correction.py b/modelscope/models/nlp/bart_for_text_error_correction.py
index 2339f221..27abedb5 100644
--- a/modelscope/models/nlp/bart_for_text_error_correction.py
+++ b/modelscope/models/nlp/bart_for_text_error_correction.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/models/nlp/bert_for_sequence_classification.py b/modelscope/models/nlp/bert_for_sequence_classification.py
index 75105f36..2b1a3b3b 100644
--- a/modelscope/models/nlp/bert_for_sequence_classification.py
+++ b/modelscope/models/nlp/bert_for_sequence_classification.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Any, Dict
 
diff --git a/modelscope/models/nlp/csanmt_for_translation.py b/modelscope/models/nlp/csanmt_for_translation.py
index 83b58060..4bac8e6d 100644
--- a/modelscope/models/nlp/csanmt_for_translation.py
+++ b/modelscope/models/nlp/csanmt_for_translation.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from THUMT,
+# publicly available at https://github.com/THUNLP-MT/THUMT
+# Copyright 2017-2022 The Alibaba MT Team Authors. All rights reserved.
 import math
 from collections import namedtuple
 from typing import Dict
diff --git a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py
index fe1402e8..d686ea30 100644
--- a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py
+++ b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Dict
 
 from modelscope.metainfo import Models
diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py
index 69e9ba7c..498d15de 100644
--- a/modelscope/models/nlp/gpt3/modeling_gpt3.py
+++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
 # Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/modelscope/models/nlp/heads/infromation_extraction_head.py b/modelscope/models/nlp/heads/infromation_extraction_head.py
index cf957834..6c3388f0 100644
--- a/modelscope/models/nlp/heads/infromation_extraction_head.py
+++ b/modelscope/models/nlp/heads/infromation_extraction_head.py
@@ -1,13 +1,10 @@
-from typing import Dict
-
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
-import torch.nn.functional as F
 from torch import nn
 
 from modelscope.metainfo import Heads
 from modelscope.models.base import TorchHead
 from modelscope.models.builder import HEADS
-from modelscope.outputs import OutputKeys
 from modelscope.utils.constant import Tasks
 
 
diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py
index e608f035..fb03b7ff 100644
--- a/modelscope/models/nlp/heads/sequence_classification_head.py
+++ b/modelscope/models/nlp/heads/sequence_classification_head.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Dict
 
 import torch
diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py
index 481524ae..ace3deac 100644
--- a/modelscope/models/nlp/heads/token_classification_head.py
+++ b/modelscope/models/nlp/heads/token_classification_head.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Dict
 
 import torch
diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py
index 6ff6c96f..fb54637b 100644
--- a/modelscope/models/nlp/heads/torch_pretrain_head.py
+++ b/modelscope/models/nlp/heads/torch_pretrain_head.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Dict
 
 import torch
diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py
index 4f466c23..514a04cd 100644
--- a/modelscope/models/nlp/masked_language.py
+++ b/modelscope/models/nlp/masked_language.py
@@ -1,6 +1,5 @@
-from typing import Any, Dict, Optional, Union
+# Copyright (c) Alibaba, Inc. and its affiliates.
 
-import numpy as np
 from transformers import BertForMaskedLM as BertForMaskedLMTransformer
 
 from modelscope.metainfo import Models
diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
index 37216510..62198ed2 100644
--- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
+++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
@@ -1,3 +1,7 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. All rights reserved.
+# The CRF implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp)
+# and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications.
+
 import os
 from typing import Any, Dict, List, Optional
 
@@ -208,8 +212,6 @@ class CRF(nn.Module):
        Learning*. Morgan Kaufmann. pp. 282–289.
     .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
 
-    The implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp)
-    and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications.
     """
 
     def __init__(self, num_tags: int, batch_first: bool = False) -> None:
diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py
index 99b00454..f395ebd4 100644
--- a/modelscope/models/nlp/palm_v2/modeling_palm.py
+++ b/modelscope/models/nlp/palm_v2/modeling_palm.py
@@ -1,3 +1,19 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import codecs
 import copy
 import math
diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
index ae92427e..2c37afd6 100644
--- a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
+++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Dict, List
 
 from modelscope.metainfo import Models
diff --git a/modelscope/models/nlp/passage_ranking.py b/modelscope/models/nlp/passage_ranking.py
index 68bca231..2a06ce45 100644
--- a/modelscope/models/nlp/passage_ranking.py
+++ b/modelscope/models/nlp/passage_ranking.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 import numpy as np
diff --git a/modelscope/models/nlp/sentence_embedding.py b/modelscope/models/nlp/sentence_embedding.py
index 955c0e53..340c133f 100644
--- a/modelscope/models/nlp/sentence_embedding.py
+++ b/modelscope/models/nlp/sentence_embedding.py
@@ -1,7 +1,7 @@
-import os
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
-import json
 import numpy as np
 
 from modelscope.metainfo import Models
diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py
index e8802dbd..a8930e68 100644
--- a/modelscope/models/nlp/sequence_classification.py
+++ b/modelscope/models/nlp/sequence_classification.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from abc import abstractmethod
 
 from torch import nn
diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py
index 20a44787..4792d07c 100644
--- a/modelscope/models/nlp/task_models/information_extraction.py
+++ b/modelscope/models/nlp/task_models/information_extraction.py
@@ -1,7 +1,7 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Any, Dict
 
 import numpy as np
-import torch
 
 from modelscope.metainfo import TaskModels
 from modelscope.models.builder import MODELS
@@ -9,9 +9,6 @@ from modelscope.models.nlp.task_models.task_model import \
     SingleBackboneTaskModelBase
 from modelscope.outputs import OutputKeys
 from modelscope.utils.constant import Tasks
-from modelscope.utils.hub import parse_label_mapping
-from modelscope.utils.tensor_utils import (torch_nested_detach,
-                                           torch_nested_numpify)
 
 __all__ = ['InformationExtractionModel']
 
diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py
index 80bfd476..43a96327 100644
--- a/modelscope/models/nlp/task_models/sequence_classification.py
+++ b/modelscope/models/nlp/task_models/sequence_classification.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Any, Dict
 
diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py
index 104b4c32..e93dd5f6 100644
--- a/modelscope/models/nlp/task_models/task_model.py
+++ b/modelscope/models/nlp/task_models/task_model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path
 import re
 from abc import ABC
diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py
index 29679838..5c22098f 100644
--- a/modelscope/models/nlp/task_models/token_classification.py
+++ b/modelscope/models/nlp/task_models/token_classification.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Any, Dict
 
 import numpy as np
diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py
index 0be921d0..c3723a61 100644
--- a/modelscope/models/nlp/token_classification.py
+++ b/modelscope/models/nlp/token_classification.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from abc import abstractmethod
 from typing import Dict
 
diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
index 0d2c96d7..79d32ace 100644
--- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
+++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py
index 202e6213..e5c05e86 100644
--- a/modelscope/pipelines/nlp/distributed_plug_pipeline.py
+++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 import torch
diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
index 65831a17..1d46d8fd 100644
--- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
+++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index db6b61c6..12f4b80f 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Any, Dict, Optional, Union
 
diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py
index 4cb138d6..07223d07 100644
--- a/modelscope/pipelines/nlp/information_extraction_pipeline.py
+++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py
@@ -1,11 +1,12 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
 
 from modelscope.metainfo import Pipelines
 from modelscope.models import Model
-from modelscope.outputs import OutputKeys
-from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import (Preprocessor,
                                       RelationExtractionPreprocessor)
diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
index 8fbdde86..467d7aba 100644
--- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
+++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
index 5248db8c..bdb75c73 100644
--- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Union
 
 from modelscope.models.base import Model
diff --git a/modelscope/pipelines/nlp/passage_ranking_pipeline.py b/modelscope/pipelines/nlp/passage_ranking_pipeline.py
index c03e7b93..1d818ac0 100644
--- a/modelscope/pipelines/nlp/passage_ranking_pipeline.py
+++ b/modelscope/pipelines/nlp/passage_ranking_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
index 3ef6d06b..16dedb2e 100644
--- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
+++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
index 28bbc732..3d8e8fea 100644
--- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
+++ b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Union
 
 import numpy as np
diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
index 844c6839..0a2f6d25 100644
--- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Union
 
 from ...metainfo import Pipelines
diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py
index b63d8d36..8e9bf85d 100644
--- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py
+++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index 3d27ffa9..ea35763f 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py
index 804f8146..aabf48d8 100644
--- a/modelscope/pipelines/nlp/token_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/token_classification_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py
index e4893577..eb7f7f74 100644
--- a/modelscope/pipelines/nlp/translation_pipeline.py
+++ b/modelscope/pipelines/nlp/translation_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
index 7e8b22bc..9d4bb67f 100644
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
index 38c0ee77..fc7051c7 100644
--- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Union
 
 import torch
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index e37b3324..b4be1845 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -24,7 +24,8 @@ if TYPE_CHECKING:
         TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
         SequenceLabelingPreprocessor, RelationExtractionPreprocessor,
         DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor,
-        PassageRankingPreprocessor, Text2TextGenerationPreprocessor,
+        PassageRankingPreprocessor, SentenceEmbeddingPreprocessor,
+        Text2TextGenerationPreprocessor,
         WordSegmentationBlankSetToLabelPreprocessor)
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py
index f305df27..8e75ae98 100644
--- a/modelscope/preprocessors/nlp/__init__.py
+++ b/modelscope/preprocessors/nlp/__init__.py
@@ -15,6 +15,7 @@ if TYPE_CHECKING:
         FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor,
         RelationExtractionPreprocessor, DocumentSegmentationPreprocessor,
         FillMaskPoNetPreprocessor, PassageRankingPreprocessor,
+        SentenceEmbeddingPreprocessor,
         WordSegmentationBlankSetToLabelPreprocessor)
 
 else:
diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py
index d294f517..d6325eed 100644
--- a/modelscope/preprocessors/nlp/nlp_base.py
+++ b/modelscope/preprocessors/nlp/nlp_base.py
@@ -23,16 +23,24 @@ from modelscope.utils.type_assert import type_assert
 logger = get_logger()
 
 __all__ = [
-    'Tokenize', 'SequenceClassificationPreprocessor',
-    'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
+    'Tokenize',
+    'SequenceClassificationPreprocessor',
+    'TextGenerationPreprocessor',
+    'TokenClassificationPreprocessor',
     'PairSentenceClassificationPreprocessor',
     'Text2TextGenerationPreprocessor',
-    'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
-    'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
-    'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
-    'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
-    'RelationExtractionPreprocessor', 'DocumentSegmentationPreprocessor',
-    'FillMaskPoNetPreprocessor'
+    'SingleSentenceClassificationPreprocessor',
+    'FillMaskPreprocessor',
+    'ZeroShotClassificationPreprocessor',
+    'NERPreprocessor',
+    'SentenceEmbeddingPreprocessor',
+    'PassageRankingPreprocessor',
+    'FaqQuestionAnsweringPreprocessor',
+    'SequenceLabelingPreprocessor',
+    'RelationExtractionPreprocessor',
+    'DocumentSegmentationPreprocessor',
+    'FillMaskPoNetPreprocessor',
+    'WordSegmentationBlankSetToLabelPreprocessor',
 ]
 
 
diff --git a/modelscope/trainers/nlp/csanmt_translation_trainer.py b/modelscope/trainers/nlp/csanmt_translation_trainer.py
index 62ae91a8..c93599c7 100644
--- a/modelscope/trainers/nlp/csanmt_translation_trainer.py
+++ b/modelscope/trainers/nlp/csanmt_translation_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Dict, Optional
 
diff --git a/modelscope/trainers/nlp/passage_ranking_trainer.py b/modelscope/trainers/nlp/passage_ranking_trainer.py
index e54c2904..711fd0c4 100644
--- a/modelscope/trainers/nlp/passage_ranking_trainer.py
+++ b/modelscope/trainers/nlp/passage_ranking_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import time
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
diff --git a/modelscope/trainers/nlp/sequence_classification_trainer.py b/modelscope/trainers/nlp/sequence_classification_trainer.py
index 64fd59b4..ec46e037 100644
--- a/modelscope/trainers/nlp/sequence_classification_trainer.py
+++ b/modelscope/trainers/nlp/sequence_classification_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import time
 from typing import Dict, Optional, Tuple, Union
 
diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
index c559ee5b..2e59cd80 100644
--- a/modelscope/trainers/nlp/space/dialog_intent_trainer.py
+++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import time
 from typing import Callable, Dict, Optional, Tuple, Union
diff --git a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py
index 6bdd8a3a..726404d4 100644
--- a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py
+++ b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import time
 from typing import Callable, Dict, Optional, Tuple, Union
diff --git a/modelscope/trainers/nlp/space/metrics/metrics_tracker.py b/modelscope/trainers/nlp/space/metrics/metrics_tracker.py
index 865600d3..340077a6 100644
--- a/modelscope/trainers/nlp/space/metrics/metrics_tracker.py
+++ b/modelscope/trainers/nlp/space/metrics/metrics_tracker.py
@@ -1,6 +1,4 @@
-"""
-MetricsTracker class
-"""
+# Copyright (c) Alibaba, Inc. and its affiliates.
 
 import math
 from collections import defaultdict
diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py
index 4a14be31..b54aa666 100644
--- a/modelscope/trainers/nlp_trainer.py
+++ b/modelscope/trainers/nlp_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Callable, Optional, Tuple, Union
 

From c8be0e8b7837ef4d31c8a8c33d9238b0516a5d15 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Tue, 27 Sep 2022 09:45:19 +0800
Subject: [PATCH 146/175] [to #44902165] remove device placement for image
 cartoon to avoid full gpu memory usage

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10260495
---
 modelscope/pipelines/cv/image_cartoon_pipeline.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py
index 72fda989..787aa06d 100644
--- a/modelscope/pipelines/cv/image_cartoon_pipeline.py
+++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py
@@ -37,15 +37,12 @@ class ImageCartoonPipeline(Pipeline):
             model: model id on modelscope hub.
         """
         super().__init__(model=model, **kwargs)
-        with device_placement(self.framework, self.device_name):
-            self.facer = FaceAna(self.model)
-            with tf.Graph().as_default():
-                self.sess_anime_head = self.load_sess(
-                    os.path.join(self.model, 'cartoon_h.pb'),
-                    'model_anime_head')
-                self.sess_anime_bg = self.load_sess(
-                    os.path.join(self.model, 'cartoon_bg.pb'),
-                    'model_anime_bg')
+        self.facer = FaceAna(self.model)
+        with tf.Graph().as_default():
+            self.sess_anime_head = self.load_sess(
+                os.path.join(self.model, 'cartoon_h.pb'), 'model_anime_head')
+            self.sess_anime_bg = self.load_sess(
+                os.path.join(self.model, 'cartoon_bg.pb'), 'model_anime_bg')
 
         self.box_width = 288
         global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg'))

From 26df8f198820c3c079e38c8fdb94c2fd4d836581 Mon Sep 17 00:00:00 2001
From: "wendi.hwd" <wendi.hwd@alibaba-inc.com>
Date: Tue, 27 Sep 2022 15:01:05 +0800
Subject: [PATCH 147/175] [to #42322933]add semantic-segmentation task output
 is numpy mask for demo-service         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10265856

---
 modelscope/models/cv/salient_detection/salient_model.py   | 3 ++-
 modelscope/outputs.py                                     | 6 ++++++
 .../pipelines/cv/image_salient_detection_pipeline.py      | 8 ++------
 modelscope/utils/constant.py                              | 1 +
 tests/pipelines/test_salient_detection.py                 | 5 ++---
 5 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/modelscope/models/cv/salient_detection/salient_model.py b/modelscope/models/cv/salient_detection/salient_model.py
index 6e617f58..73c3c3fb 100644
--- a/modelscope/models/cv/salient_detection/salient_model.py
+++ b/modelscope/models/cv/salient_detection/salient_model.py
@@ -14,7 +14,8 @@ from modelscope.utils.constant import ModelFile, Tasks
 from .models import U2NET
 
 
-@MODELS.register_module(Tasks.image_segmentation, module_name=Models.detection)
+@MODELS.register_module(
+    Tasks.semantic_segmentation, module_name=Models.detection)
 class SalientDetection(TorchModel):
 
     def __init__(self, model_dir: str, *args, **kwargs):
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 052d4f33..b19f7e43 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -151,6 +151,12 @@ TASK_OUTPUTS = {
     Tasks.image_segmentation:
     [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS],
 
+    # semantic segmentation result for single sample
+    #   {
+    #       "masks": [np.array # 2D array containing only 0, 255]
+    #   }
+    Tasks.semantic_segmentation: [OutputKeys.MASKS],
+
     # image matting result for single sample
     # {
     #   "output_img": np.array with shape(h, w, 4)
diff --git a/modelscope/pipelines/cv/image_salient_detection_pipeline.py b/modelscope/pipelines/cv/image_salient_detection_pipeline.py
index 433275ba..3b145cf0 100644
--- a/modelscope/pipelines/cv/image_salient_detection_pipeline.py
+++ b/modelscope/pipelines/cv/image_salient_detection_pipeline.py
@@ -9,7 +9,7 @@ from modelscope.utils.constant import Tasks
 
 
 @PIPELINES.register_module(
-    Tasks.image_segmentation, module_name=Pipelines.salient_detection)
+    Tasks.semantic_segmentation, module_name=Pipelines.salient_detection)
 class ImageSalientDetectionPipeline(Pipeline):
 
     def __init__(self, model: str, **kwargs):
@@ -39,9 +39,5 @@ class ImageSalientDetectionPipeline(Pipeline):
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
 
         data = self.model.postprocess(inputs)
-        outputs = {
-            OutputKeys.SCORES: None,
-            OutputKeys.LABELS: None,
-            OutputKeys.MASKS: data
-        }
+        outputs = {OutputKeys.MASKS: data}
         return outputs
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 4c5d2f41..de3d933f 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -38,6 +38,7 @@ class CVTasks(object):
     image_object_detection = 'image-object-detection'
 
     image_segmentation = 'image-segmentation'
+    semantic_segmentation = 'semantic-segmentation'
     portrait_matting = 'portrait-matting'
     text_driven_segmentation = 'text-driven-segmentation'
     shop_segmentation = 'shop-segmentation'
diff --git a/tests/pipelines/test_salient_detection.py b/tests/pipelines/test_salient_detection.py
index e87e9388..bcb904e6 100644
--- a/tests/pipelines/test_salient_detection.py
+++ b/tests/pipelines/test_salient_detection.py
@@ -11,17 +11,16 @@ from modelscope.utils.test_utils import test_level
 class SalientDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
-        self.task = Tasks.image_segmentation
+        self.task = Tasks.semantic_segmentation
         self.model_id = 'damo/cv_u2net_salient-detection'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_salient_detection(self):
         input_location = 'data/test/images/image_salient_detection.jpg'
         model_id = 'damo/cv_u2net_salient-detection'
-        salient_detect = pipeline(Tasks.image_segmentation, model=model_id)
+        salient_detect = pipeline(Tasks.semantic_segmentation, model=model_id)
         result = salient_detect(input_location)
         import cv2
-        # result[OutputKeys.MASKS] is salient map result,other keys are not used
         cv2.imwrite(input_location + '_salient.jpg', result[OutputKeys.MASKS])
 
     @unittest.skip('demo compatibility test is only enabled on a needed-basis')

From e90ff9e4795129eb8d64a2c4b67b3833217c7e1b Mon Sep 17 00:00:00 2001
From: "jiaqi.sjq" <jiaqi.sjq@alibaba-inc.com>
Date: Tue, 27 Sep 2022 22:09:30 +0800
Subject: [PATCH 148/175] [to #42322933] tts sambert am changes from TensorFlow
 to PyTorch and add licenses

    * [to #41669377] docs and tools refinement and release

1. add build_doc linter script
2. add sphinx-docs support
3. add development doc and api doc
4. change version to 0.1.0 for the first internal release version

Link: https://code.aone.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8775307
---
 .../models/audio/tts/models/__init__.py       |    9 -
 .../models/audio/tts/models/am_models.py      |  460 -------
 modelscope/models/audio/tts/models/compat.py  |   82 --
 .../tts/{text => models/datasets}/__init__.py |    0
 .../tts/models/datasets/kantts_data4fs.py     |  238 ++++
 .../audio/tts/models/datasets/samplers.py     |  131 ++
 .../tts/models/datasets/units/__init__.py     |    3 +
 .../tts/models/datasets/units/cleaners.py     |   88 ++
 .../tts/models/datasets/units/ling_unit.py    |  395 ++++++
 .../datasets/units}/numbers.py                |    3 +
 modelscope/models/audio/tts/models/fsmn.py    |  273 ----
 .../models/audio/tts/models/fsmn_encoder.py   |  178 ---
 modelscope/models/audio/tts/models/helpers.py |  159 ---
 .../audio/tts/models/models/__init__.py       |    0
 .../tts/models/models/hifigan/__init__.py     |    3 +
 .../tts/models/models/hifigan/hifigan.py      |  238 ++++
 .../tts/models/models/sambert/__init__.py     |    3 +
 .../tts/models/models/sambert/adaptors.py     |  131 ++
 .../audio/tts/models/models/sambert/base.py   |  369 ++++++
 .../audio/tts/models/models/sambert/fsmn.py   |  126 ++
 .../models/models/sambert/kantts_sambert.py   |  718 ++++++++++
 .../tts/models/models/sambert/positions.py    |  101 ++
 .../models/audio/tts/models/position.py       |  174 ---
 modelscope/models/audio/tts/models/reducer.py |  155 ---
 .../models/audio/tts/models/rnn_wrappers.py   |  237 ----
 .../models/audio/tts/models/robutrans.py      |  760 -----------
 .../tts/models/self_attention_decoder.py      |  817 ------------
 .../tts/models/self_attention_encoder.py      |  182 ---
 .../models/audio/tts/models/transformer.py    | 1157 -----------------
 modelscope/models/audio/tts/models/utils.py   |   59 -
 .../models/audio/tts/models/utils/__init__.py |    3 +
 .../models/audio/tts/models/utils/utils.py    |  136 ++
 .../models/audio/tts/models/vocoder_models.py |  516 --------
 modelscope/models/audio/tts/sambert_hifi.py   |   34 +-
 modelscope/models/audio/tts/text/cleaners.py  |   89 --
 modelscope/models/audio/tts/text/cmudict.py   |   64 -
 modelscope/models/audio/tts/text/symbols.py   |  105 --
 .../models/audio/tts/text/symbols_dict.py     |  200 ---
 modelscope/models/audio/tts/voice.py          |  333 ++---
 .../audio/text_to_speech_pipeline.py          |    5 +
 modelscope/utils/audio/tts_exceptions.py      |    3 +-
 requirements/audio.txt                        |    5 -
 tests/pipelines/test_text_to_speech.py        |    5 +-
 43 files changed, 2799 insertions(+), 5948 deletions(-)
 mode change 100755 => 100644 modelscope/models/audio/tts/models/__init__.py
 delete mode 100755 modelscope/models/audio/tts/models/am_models.py
 delete mode 100755 modelscope/models/audio/tts/models/compat.py
 rename modelscope/models/audio/tts/{text => models/datasets}/__init__.py (100%)
 mode change 100755 => 100644
 create mode 100644 modelscope/models/audio/tts/models/datasets/kantts_data4fs.py
 create mode 100644 modelscope/models/audio/tts/models/datasets/samplers.py
 create mode 100644 modelscope/models/audio/tts/models/datasets/units/__init__.py
 create mode 100644 modelscope/models/audio/tts/models/datasets/units/cleaners.py
 create mode 100644 modelscope/models/audio/tts/models/datasets/units/ling_unit.py
 rename modelscope/models/audio/tts/{text => models/datasets/units}/numbers.py (94%)
 mode change 100755 => 100644
 delete mode 100755 modelscope/models/audio/tts/models/fsmn.py
 delete mode 100755 modelscope/models/audio/tts/models/fsmn_encoder.py
 delete mode 100755 modelscope/models/audio/tts/models/helpers.py
 create mode 100644 modelscope/models/audio/tts/models/models/__init__.py
 create mode 100644 modelscope/models/audio/tts/models/models/hifigan/__init__.py
 create mode 100755 modelscope/models/audio/tts/models/models/hifigan/hifigan.py
 create mode 100644 modelscope/models/audio/tts/models/models/sambert/__init__.py
 create mode 100644 modelscope/models/audio/tts/models/models/sambert/adaptors.py
 create mode 100644 modelscope/models/audio/tts/models/models/sambert/base.py
 create mode 100644 modelscope/models/audio/tts/models/models/sambert/fsmn.py
 create mode 100644 modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py
 create mode 100644 modelscope/models/audio/tts/models/models/sambert/positions.py
 delete mode 100755 modelscope/models/audio/tts/models/position.py
 delete mode 100755 modelscope/models/audio/tts/models/reducer.py
 delete mode 100755 modelscope/models/audio/tts/models/rnn_wrappers.py
 delete mode 100755 modelscope/models/audio/tts/models/robutrans.py
 delete mode 100755 modelscope/models/audio/tts/models/self_attention_decoder.py
 delete mode 100755 modelscope/models/audio/tts/models/self_attention_encoder.py
 delete mode 100755 modelscope/models/audio/tts/models/transformer.py
 delete mode 100755 modelscope/models/audio/tts/models/utils.py
 create mode 100644 modelscope/models/audio/tts/models/utils/__init__.py
 create mode 100755 modelscope/models/audio/tts/models/utils/utils.py
 delete mode 100755 modelscope/models/audio/tts/models/vocoder_models.py
 delete mode 100755 modelscope/models/audio/tts/text/cleaners.py
 delete mode 100755 modelscope/models/audio/tts/text/cmudict.py
 delete mode 100644 modelscope/models/audio/tts/text/symbols.py
 delete mode 100644 modelscope/models/audio/tts/text/symbols_dict.py

diff --git a/modelscope/models/audio/tts/models/__init__.py b/modelscope/models/audio/tts/models/__init__.py
old mode 100755
new mode 100644
index c260d4fe..e69de29b
--- a/modelscope/models/audio/tts/models/__init__.py
+++ b/modelscope/models/audio/tts/models/__init__.py
@@ -1,9 +0,0 @@
-from .robutrans import RobuTrans
-from .vocoder_models import Generator
-
-
-def create_am_model(name, hparams):
-    if name == 'robutrans':
-        return RobuTrans(hparams)
-    else:
-        raise Exception('Unknown model: ' + name)
diff --git a/modelscope/models/audio/tts/models/am_models.py b/modelscope/models/audio/tts/models/am_models.py
deleted file mode 100755
index cd43ff12..00000000
--- a/modelscope/models/audio/tts/models/am_models.py
+++ /dev/null
@@ -1,460 +0,0 @@
-import tensorflow as tf
-
-
-def encoder_prenet(inputs,
-                   n_conv_layers,
-                   filters,
-                   kernel_size,
-                   dense_units,
-                   is_training,
-                   mask=None,
-                   scope='encoder_prenet'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-        x = tf.layers.dense(
-            x, units=dense_units, activation=None, name='dense')
-    return x
-
-
-def decoder_prenet(inputs,
-                   prenet_units,
-                   dense_units,
-                   is_training,
-                   scope='decoder_prenet'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i, units in enumerate(prenet_units):
-            x = tf.layers.dense(
-                x,
-                units=units,
-                activation=tf.nn.relu,
-                name='dense_{}'.format(i))
-            x = tf.layers.dropout(
-                x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
-        x = tf.layers.dense(
-            x, units=dense_units, activation=None, name='dense')
-    return x
-
-
-def encoder(inputs,
-            input_lengths,
-            n_conv_layers,
-            filters,
-            kernel_size,
-            lstm_units,
-            is_training,
-            embedded_inputs_speaker,
-            mask=None,
-            scope='encoder'):
-    with tf.variable_scope(scope):
-        x = conv_and_lstm(
-            inputs,
-            input_lengths,
-            n_conv_layers,
-            filters,
-            kernel_size,
-            lstm_units,
-            is_training,
-            embedded_inputs_speaker,
-            mask=mask)
-    return x
-
-
-def prenet(inputs, prenet_units, is_training, scope='prenet'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i, units in enumerate(prenet_units):
-            x = tf.layers.dense(
-                x,
-                units=units,
-                activation=tf.nn.relu,
-                name='dense_{}'.format(i))
-            x = tf.layers.dropout(
-                x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
-    return x
-
-
-def postnet_residual_ulstm(inputs,
-                           n_conv_layers,
-                           filters,
-                           kernel_size,
-                           lstm_units,
-                           output_units,
-                           is_training,
-                           scope='postnet_residual_ulstm'):
-    with tf.variable_scope(scope):
-        x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
-                           lstm_units, is_training)
-        x = conv1d(
-            x,
-            output_units,
-            kernel_size,
-            is_training,
-            activation=None,
-            dropout=False,
-            scope='conv1d_{}'.format(n_conv_layers - 1))
-    return x
-
-
-def postnet_residual_lstm(inputs,
-                          n_conv_layers,
-                          filters,
-                          kernel_size,
-                          lstm_units,
-                          output_units,
-                          is_training,
-                          scope='postnet_residual_lstm'):
-    with tf.variable_scope(scope):
-        x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size,
-                          lstm_units, is_training)
-        x = conv1d(
-            x,
-            output_units,
-            kernel_size,
-            is_training,
-            activation=None,
-            dropout=False,
-            scope='conv1d_{}'.format(n_conv_layers - 1))
-    return x
-
-
-def postnet_linear_ulstm(inputs,
-                         n_conv_layers,
-                         filters,
-                         kernel_size,
-                         lstm_units,
-                         output_units,
-                         is_training,
-                         scope='postnet_linear'):
-    with tf.variable_scope(scope):
-        x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
-                           lstm_units, is_training)
-        x = tf.layers.dense(x, units=output_units)
-    return x
-
-
-def postnet_linear_lstm(inputs,
-                        n_conv_layers,
-                        filters,
-                        kernel_size,
-                        lstm_units,
-                        output_units,
-                        output_lengths,
-                        is_training,
-                        embedded_inputs_speaker2,
-                        mask=None,
-                        scope='postnet_linear'):
-    with tf.variable_scope(scope):
-        x = conv_and_lstm_dec(
-            inputs,
-            output_lengths,
-            n_conv_layers,
-            filters,
-            kernel_size,
-            lstm_units,
-            is_training,
-            embedded_inputs_speaker2,
-            mask=mask)
-        x = tf.layers.dense(x, units=output_units)
-    return x
-
-
-def postnet_linear(inputs,
-                   n_conv_layers,
-                   filters,
-                   kernel_size,
-                   lstm_units,
-                   output_units,
-                   output_lengths,
-                   is_training,
-                   embedded_inputs_speaker2,
-                   mask=None,
-                   scope='postnet_linear'):
-    with tf.variable_scope(scope):
-        x = conv_dec(
-            inputs,
-            output_lengths,
-            n_conv_layers,
-            filters,
-            kernel_size,
-            lstm_units,
-            is_training,
-            embedded_inputs_speaker2,
-            mask=mask)
-    return x
-
-
-def conv_and_lstm(inputs,
-                  sequence_lengths,
-                  n_conv_layers,
-                  filters,
-                  kernel_size,
-                  lstm_units,
-                  is_training,
-                  embedded_inputs_speaker,
-                  mask=None,
-                  scope='conv_and_lstm'):
-    from tensorflow.contrib.rnn import LSTMBlockCell
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-        x = tf.concat([x, embedded_inputs_speaker], axis=2)
-
-        outputs, states = tf.nn.bidirectional_dynamic_rnn(
-            LSTMBlockCell(lstm_units),
-            LSTMBlockCell(lstm_units),
-            x,
-            sequence_length=sequence_lengths,
-            dtype=tf.float32)
-        x = tf.concat(outputs, axis=-1)
-
-    return x
-
-
-def conv_and_lstm_dec(inputs,
-                      sequence_lengths,
-                      n_conv_layers,
-                      filters,
-                      kernel_size,
-                      lstm_units,
-                      is_training,
-                      embedded_inputs_speaker2,
-                      mask=None,
-                      scope='conv_and_lstm'):
-    x = inputs
-    from tensorflow.contrib.rnn import LSTMBlockCell
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-        x = tf.concat([x, embedded_inputs_speaker2], axis=2)
-
-        outputs, states = tf.nn.bidirectional_dynamic_rnn(
-            LSTMBlockCell(lstm_units),
-            LSTMBlockCell(lstm_units),
-            x,
-            sequence_length=sequence_lengths,
-            dtype=tf.float32)
-        x = tf.concat(outputs, axis=-1)
-    return x
-
-
-def conv_dec(inputs,
-             sequence_lengths,
-             n_conv_layers,
-             filters,
-             kernel_size,
-             lstm_units,
-             is_training,
-             embedded_inputs_speaker2,
-             mask=None,
-             scope='conv_and_lstm'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-        x = tf.concat([x, embedded_inputs_speaker2], axis=2)
-    return x
-
-
-def conv_and_ulstm(inputs,
-                   sequence_lengths,
-                   n_conv_layers,
-                   filters,
-                   kernel_size,
-                   lstm_units,
-                   is_training,
-                   scope='conv_and_ulstm'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                scope='conv1d_{}'.format(i))
-
-        outputs, states = tf.nn.dynamic_rnn(
-            LSTMBlockCell(lstm_units),
-            x,
-            sequence_length=sequence_lengths,
-            dtype=tf.float32)
-
-    return outputs
-
-
-def conv1d(inputs,
-           filters,
-           kernel_size,
-           is_training,
-           activation=None,
-           dropout=False,
-           mask=None,
-           scope='conv1d'):
-    with tf.variable_scope(scope):
-        if mask is not None:
-            inputs = inputs * tf.expand_dims(mask, -1)
-        x = tf.layers.conv1d(
-            inputs, filters=filters, kernel_size=kernel_size, padding='same')
-        if mask is not None:
-            x = x * tf.expand_dims(mask, -1)
-
-        x = tf.layers.batch_normalization(x, training=is_training)
-        if activation is not None:
-            x = activation(x)
-        if dropout:
-            x = tf.layers.dropout(x, rate=0.5, training=is_training)
-    return x
-
-
-def conv1d_dp(inputs,
-              filters,
-              kernel_size,
-              is_training,
-              activation=None,
-              dropout=False,
-              dropoutrate=0.5,
-              mask=None,
-              scope='conv1d'):
-    with tf.variable_scope(scope):
-        if mask is not None:
-            inputs = inputs * tf.expand_dims(mask, -1)
-        x = tf.layers.conv1d(
-            inputs, filters=filters, kernel_size=kernel_size, padding='same')
-        if mask is not None:
-            x = x * tf.expand_dims(mask, -1)
-
-        x = tf.contrib.layers.layer_norm(x)
-        if activation is not None:
-            x = activation(x)
-        if dropout:
-            x = tf.layers.dropout(x, rate=dropoutrate, training=is_training)
-    return x
-
-
-def duration_predictor(inputs,
-                       n_conv_layers,
-                       filters,
-                       kernel_size,
-                       lstm_units,
-                       input_lengths,
-                       is_training,
-                       embedded_inputs_speaker,
-                       mask=None,
-                       scope='duration_predictor'):
-    with tf.variable_scope(scope):
-        x = inputs
-        for i in range(n_conv_layers):
-            x = conv1d_dp(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                dropoutrate=0.1,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-        x = tf.concat([x, embedded_inputs_speaker], axis=2)
-
-        outputs, states = tf.nn.bidirectional_dynamic_rnn(
-            LSTMBlockCell(lstm_units),
-            LSTMBlockCell(lstm_units),
-            x,
-            sequence_length=input_lengths,
-            dtype=tf.float32)
-        x = tf.concat(outputs, axis=-1)
-
-        x = tf.layers.dense(x, units=1)
-        x = tf.nn.relu(x)
-    return x
-
-
-def duration_predictor2(inputs,
-                        n_conv_layers,
-                        filters,
-                        kernel_size,
-                        input_lengths,
-                        is_training,
-                        mask=None,
-                        scope='duration_predictor'):
-    with tf.variable_scope(scope):
-        x = inputs
-        for i in range(n_conv_layers):
-            x = conv1d_dp(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                dropoutrate=0.1,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-        x = tf.layers.dense(x, units=1)
-        x = tf.nn.relu(x)
-    return x
-
-
-def conv_prenet(inputs,
-                n_conv_layers,
-                filters,
-                kernel_size,
-                is_training,
-                mask=None,
-                scope='conv_prenet'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-    return x
diff --git a/modelscope/models/audio/tts/models/compat.py b/modelscope/models/audio/tts/models/compat.py
deleted file mode 100755
index bb810841..00000000
--- a/modelscope/models/audio/tts/models/compat.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""Functions for compatibility with different TensorFlow versions."""
-
-import tensorflow as tf
-
-
-def is_tf2():
-    """Returns ``True`` if running TensorFlow 2.0."""
-    return tf.__version__.startswith('2')
-
-
-def tf_supports(symbol):
-    """Returns ``True`` if TensorFlow defines :obj:`symbol`."""
-    return _string_to_tf_symbol(symbol) is not None
-
-
-def tf_any(*symbols):
-    """Returns the first supported symbol."""
-    for symbol in symbols:
-        module = _string_to_tf_symbol(symbol)
-        if module is not None:
-            return module
-    return None
-
-
-def tf_compat(v2=None, v1=None):  # pylint: disable=invalid-name
-    """Returns the compatible symbol based on the current TensorFlow version.
-
-    Args:
-      v2: The candidate v2 symbol name.
-      v1: The candidate v1 symbol name.
-
-    Returns:
-      A TensorFlow symbol.
-
-    Raises:
-      ValueError: if no symbol can be found.
-    """
-    candidates = []
-    if v2 is not None:
-        candidates.append(v2)
-    if v1 is not None:
-        candidates.append(v1)
-        candidates.append('compat.v1.%s' % v1)
-    symbol = tf_any(*candidates)
-    if symbol is None:
-        raise ValueError('Failure to resolve the TensorFlow symbol')
-    return symbol
-
-
-def name_from_variable_scope(name=''):
-    """Creates a name prefixed by the current variable scope."""
-    var_scope = tf_compat(v1='get_variable_scope')().name
-    compat_name = ''
-    if name:
-        compat_name = '%s/' % name
-    if var_scope:
-        compat_name = '%s/%s' % (var_scope, compat_name)
-    return compat_name
-
-
-def reuse():
-    """Returns ``True`` if the current variable scope is marked for reuse."""
-    return tf_compat(v1='get_variable_scope')().reuse
-
-
-def _string_to_tf_symbol(symbol):
-    modules = symbol.split('.')
-    namespace = tf
-    for module in modules:
-        namespace = getattr(namespace, module, None)
-        if namespace is None:
-            return None
-    return namespace
-
-
-# pylint: disable=invalid-name
-gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy')
-gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists')
-gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile')
-is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor')
-logging = tf_compat(v1='logging')
-nest = tf_compat(v2='nest', v1='contrib.framework.nest')
diff --git a/modelscope/models/audio/tts/text/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py
old mode 100755
new mode 100644
similarity index 100%
rename from modelscope/models/audio/tts/text/__init__.py
rename to modelscope/models/audio/tts/models/datasets/__init__.py
diff --git a/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py
new file mode 100644
index 00000000..cc47d0c4
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py
@@ -0,0 +1,266 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+
+import json
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+from modelscope.utils.logger import get_logger
+from .units import KanTtsLinguisticUnit
+
+logger = get_logger()
+
+
+class KanTtsText2MelDataset(Dataset):
+    """Text-to-mel dataset driven by a ``|``-separated metadata file.
+
+    Each metadata line is split on ``|`` and read by position: utterance id,
+    mel file, frame count, linguistic symbol sequence, duration file, pitch
+    contour file and energy contour file. The file fields are loaded with
+    ``np.load`` relative to the directory containing the metadata file.
+
+    Args:
+        metadata_filename: path of the metadata file.
+        config_filename: path of the JSON configuration; it is also handed to
+            ``KanTtsLinguisticUnit``.
+        cache: when true, every sample is loaded eagerly in the constructor
+            and served from memory afterwards.
+    """
+
+    def __init__(self, metadata_filename, config_filename, cache=False):
+        super(KanTtsText2MelDataset, self).__init__()
+
+        self.cache = cache
+
+        # Read the config as UTF-8 explicitly (as is done for the metadata)
+        # instead of relying on the platform's default encoding.
+        with open(config_filename, encoding='utf-8') as f:
+            self._config = json.load(f)
+
+        # Load metadata:
+        self._datadir = os.path.dirname(metadata_filename)
+        with open(metadata_filename, encoding='utf-8') as f:
+            self._metadata = [line.strip().split('|') for line in f]
+            self._length_lst = [int(x[2]) for x in self._metadata]
+            hours = sum(
+                self._length_lst) * self._config['audio']['frame_shift_ms'] / (
+                    3600 * 1000)
+
+            logger.info('Loaded metadata for %d examples (%.2f hours)' %
+                        (len(self._metadata), hours))
+            logger.info('Minimum length: %d, Maximum length: %d' %
+                        (min(self._length_lst), max(self._length_lst)))
+
+        self.ling_unit = KanTtsLinguisticUnit(config_filename)
+        self.pad_executor = KanTtsText2MelPad()
+
+        self.r = self._config['am']['outputs_per_step']
+        self.num_mels = self._config['am']['num_mels']
+
+        # Optional random-window length from the 'adv' config section.
+        if 'adv' in self._config:
+            self.feat_window = self._config['adv']['random_window']
+        else:
+            self.feat_window = None
+        logger.info(self.feat_window)
+
+        self.data_cache = [
+            self.cache_load(i) for i in tqdm(range(len(self)))
+        ] if self.cache else []
+
+    def get_frames_lst(self):
+        """Return the per-utterance frame counts parsed from the metadata."""
+        return self._length_lst
+
+    def __getitem__(self, index):
+        if self.cache:
+            return self.data_cache[index]
+
+        return self.cache_load(index)
+
+    def cache_load(self, index):
+        """Load the sample at ``index`` from disk and return it as a dict."""
+        meta = self._metadata[index]
+
+        sample = {}
+        sample['utt_id'] = meta[0]
+
+        # Keep only the first num_mels columns of the stored array.
+        sample['mel_target'] = np.load(os.path.join(
+            self._datadir, meta[1]))[:, :self.num_mels]
+        sample['output_length'] = len(sample['mel_target'])
+
+        lfeat_symbol = meta[3]
+        sample['ling'] = self.ling_unit.encode_symbol_sequence(lfeat_symbol)
+
+        sample['duration'] = np.load(os.path.join(self._datadir, meta[4]))
+
+        sample['pitch_contour'] = np.load(os.path.join(self._datadir, meta[5]))
+
+        sample['energy_contour'] = np.load(
+            os.path.join(self._datadir, meta[6]))
+
+        return sample
+
+    def __len__(self):
+        return len(self._metadata)
+
+    def collate_fn(self, batch):
+        """Pad a list of samples into a dict of batched tensors.
+
+        Args:
+            batch: list of sample dicts as returned by ``cache_load``.
+
+        Returns:
+            A dict holding the padded linguistic inputs, emotion and speaker
+            ids, valid input/output lengths, mel targets, durations, pitch
+            and energy contours and utterance ids. ``feat_offsets`` is added
+            only when a random window is configured.
+        """
+        data_dict = {}
+
+        max_input_length = max((len(x['ling'][0]) for x in batch))
+
+        def pad_ling(feat_idx):
+            # Pad one linguistic stream with the pad value of its own type.
+            lfeat_type = self.ling_unit._lfeat_type_list[feat_idx]
+            return self.pad_executor._prepare_scalar_inputs(
+                [x['ling'][feat_idx] for x in batch], max_input_length,
+                self.ling_unit._sub_unit_pad[lfeat_type]).long()
+
+        # pure linguistic info: sy|tone|syllable_flag|word_segment
+        inputs_sy = pad_ling(0)
+        inputs_tone = pad_ling(1)
+        inputs_syllable_flag = pad_ling(2)
+        inputs_ws = pad_ling(3)
+
+        # emotion category
+        data_dict['input_emotions'] = pad_ling(4)
+
+        # speaker category
+        data_dict['input_speakers'] = pad_ling(5)
+
+        data_dict['input_lings'] = torch.stack(
+            [inputs_sy, inputs_tone, inputs_syllable_flag, inputs_ws], dim=2)
+
+        data_dict['valid_input_lengths'] = torch.as_tensor(
+            [len(x['ling'][0]) - 1 for x in batch], dtype=torch.long
+        )  # There is one '~' in the last of symbol sequence. We put length-1 for calculation.
+
+        data_dict['valid_output_lengths'] = torch.as_tensor(
+            [x['output_length'] for x in batch], dtype=torch.long)
+        max_output_length = torch.max(data_dict['valid_output_lengths']).item()
+        max_output_round_length = self.pad_executor._round_up(
+            max_output_length, self.r)
+
+        if self.feat_window is not None:
+            active_feat_len = np.minimum(max_output_round_length,
+                                         self.feat_window)
+            # A batch shorter than the window is padded up to the window.
+            if active_feat_len < self.feat_window:
+                max_output_round_length = self.pad_executor._round_up(
+                    self.feat_window, self.r)
+                active_feat_len = self.feat_window
+
+            # Draw a start offset per utterance from
+            # [0, max(1, output_length - active_feat_len)).
+            max_offsets = [x['output_length'] - active_feat_len for x in batch]
+            feat_offsets = [
+                np.random.randint(0, np.maximum(1, offset))
+                for offset in max_offsets
+            ]
+            feat_offsets = torch.from_numpy(
+                np.asarray(feat_offsets, dtype=np.int32)).long()
+            data_dict['feat_offsets'] = feat_offsets
+
+        data_dict['mel_targets'] = self.pad_executor._prepare_targets(
+            [x['mel_target'] for x in batch], max_output_round_length, 0.0)
+        data_dict['durations'] = self.pad_executor._prepare_durations(
+            [x['duration'] for x in batch], max_input_length,
+            max_output_round_length)
+
+        data_dict['pitch_contours'] = self.pad_executor._prepare_scalar_inputs(
+            [x['pitch_contour'] for x in batch], max_input_length,
+            0.0).float()
+        data_dict[
+            'energy_contours'] = self.pad_executor._prepare_scalar_inputs(
+                [x['energy_contour'] for x in batch], max_input_length,
+                0.0).float()
+
+        data_dict['utt_ids'] = [x['utt_id'] for x in batch]
+
+        return data_dict
+
+
+class KanTtsText2MelPad(object):
+    """Padding helpers that turn lists of numpy arrays into batched tensors."""
+
+    def __init__(self):
+        super(KanTtsText2MelPad, self).__init__()
+
+    def _pad1D(self, x, length, pad):
+        """Right-pad the 1-D array ``x`` with ``pad`` up to ``length``."""
+        return np.pad(
+            x, (0, length - x.shape[0]), mode='constant', constant_values=pad)
+
+    def _pad2D(self, x, length, pad):
+        """Pad the first axis of the 2-D array ``x`` up to ``length`` rows."""
+        return np.pad(
+            x, [(0, length - x.shape[0]), (0, 0)],
+            mode='constant',
+            constant_values=pad)
+
+    def _pad_durations(self, duration, max_in_len, max_out_len):
+        """Extend ``duration`` towards ``max_in_len`` entries.
+
+        When the durations sum to fewer than ``max_out_len`` frames, the
+        missing frame count is stored as one extra entry right after the real
+        ones; the remaining positions are filled with zeros.
+        """
+        framenum = np.sum(duration)
+        symbolnum = duration.shape[0]
+        if framenum < max_out_len:
+            padframenum = max_out_len - framenum
+            # NOTE(review): one entry is added before zero-filling, so this
+            # branch presumably expects symbolnum < max_in_len -- confirm.
+            duration = np.insert(
+                duration, symbolnum, values=padframenum, axis=0)
+            duration = np.insert(
+                duration,
+                symbolnum + 1,
+                values=[0] * (max_in_len - symbolnum - 1),
+                axis=0)
+        else:
+            if symbolnum < max_in_len:
+                duration = np.insert(
+                    duration,
+                    symbolnum,
+                    values=[0] * (max_in_len - symbolnum),
+                    axis=0)
+        return duration
+
+    def _round_up(self, x, multiple):
+        """Return the smallest multiple of ``multiple`` that is >= ``x``."""
+        remainder = x % multiple
+        return x if remainder == 0 else x + multiple - remainder
+
+    def _prepare_scalar_inputs(self, inputs, max_len, pad):
+        """Pad each 1-D array to ``max_len`` and stack them into a tensor."""
+        return torch.from_numpy(
+            np.stack([self._pad1D(x, max_len, pad) for x in inputs]))
+
+    def _prepare_targets(self, targets, max_len, pad):
+        """Pad each 2-D array to ``max_len`` rows; return a float tensor."""
+        return torch.from_numpy(
+            np.stack([self._pad2D(t, max_len, pad) for t in targets])).float()
+
+    def _prepare_durations(self, durations, max_in_len, max_out_len):
+        """Pad each duration array and stack them into a long tensor."""
+        return torch.from_numpy(
+            np.stack([
+                self._pad_durations(t, max_in_len, max_out_len)
+                for t in durations
+            ])).long()
diff --git a/modelscope/models/audio/tts/models/datasets/samplers.py b/modelscope/models/audio/tts/models/datasets/samplers.py
new file mode 100644
index 00000000..0657fa8a
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/samplers.py
@@ -0,0 +1,159 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import math
+import random
+
+import torch
+from torch import distributed as dist
+from torch.utils.data import Sampler
+
+
+def _group_and_sort(indexed_lengths, group_size):
+    """Split pairs into consecutive groups and sort each group by length.
+
+    Args:
+        indexed_lengths: list of ``(index, length)`` pairs in sampling order.
+        group_size: number of consecutive pairs per group; the last group
+            holds the remainder and may be shorter.
+
+    Returns:
+        A flat list in which every input index appears exactly once, ordered
+        by descending length inside each group.
+    """
+    indices = []
+    for start in range(0, len(indexed_lengths), group_size):
+        # The stepped slice already yields the trailing partial group, so no
+        # separate remainder handling is needed.
+        group = sorted(
+            indexed_lengths[start:start + group_size],
+            key=lambda item: item[1],
+            reverse=True)
+        indices.extend(index for index, _ in group)
+    return indices
+
+
+class LenSortGroupPoolSampler(Sampler):
+    """Shuffle all indices, then sort them by length inside fixed-size groups.
+
+    Args:
+        data_source: dataset being sampled; its length is what ``__len__``
+            reports.
+        length_lst: per-sample lengths, indexed by sample index.
+        group_size: number of shuffled samples that are sorted together.
+    """
+
+    def __init__(self, data_source, length_lst, group_size):
+        super(LenSortGroupPoolSampler, self).__init__(data_source)
+
+        self.data_source = data_source
+        self.length_lst = length_lst
+        self.group_size = group_size
+
+        self.num = len(self.length_lst)
+        # Number of complete groups; kept as a public attribute.
+        self.buckets = self.num // group_size
+
+    def __iter__(self):
+        """Return an iterator over one freshly shuffled pass of the indices.
+
+        Each index in ``range(len(length_lst))`` is produced exactly once.
+        """
+        random_lst = torch.randperm(self.num).tolist()
+        random_len_lst = [(i, self.length_lst[i]) for i in random_lst]
+
+        # Bucket examples based on similar output sequence length for
+        # efficiency. The partial last group used to be appended a second time
+        # (minus its final element), which emitted duplicate indices.
+        return iter(_group_and_sort(random_len_lst, self.group_size))
+
+    def __len__(self):
+        return len(self.data_source)
+
+
+class DistributedLenSortGroupPoolSampler(Sampler):
+    """Per-rank variant of the length-sorted group sampler.
+
+    The index list is padded by repeating leading indices until it divides
+    evenly across ``num_replicas``; each rank takes every ``num_replicas``-th
+    index starting at ``rank`` and sorts its share by length within groups.
+
+    Args:
+        dataset: dataset being sampled; only its length is read here.
+        length_lst: per-sample lengths, indexed by sample index.
+        group_size: number of samples that are sorted together.
+        num_replicas: number of participating processes; defaults to
+            ``dist.get_world_size()``.
+        rank: rank of the current process; defaults to ``dist.get_rank()``.
+        shuffle: whether to permute indices (seeded by the epoch) before
+            splitting them across ranks.
+
+    Raises:
+        RuntimeError: if ``num_replicas`` or ``rank`` is omitted and the
+            distributed package is unavailable.
+    """
+
+    def __init__(self,
+                 dataset,
+                 length_lst,
+                 group_size,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=True):
+        super(DistributedLenSortGroupPoolSampler, self).__init__(dataset)
+
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError(
+                    'modelscope error: Requires distributed package to be available'
+                )
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError(
+                    'modelscope error: Requires distributed package to be available'
+                )
+            rank = dist.get_rank()
+        self.dataset = dataset
+        self.length_lst = length_lst
+        self.group_size = group_size
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        # Samples per rank, rounded up so every rank gets the same count.
+        self.num_samples = int(
+            math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
+        self.total_size = self.num_samples * self.num_replicas
+        self.buckets = self.num_samples // group_size
+        self.shuffle = shuffle
+
+    def __iter__(self):
+        """Return an iterator over this rank's ``num_samples`` indices."""
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+        if self.shuffle:
+            indices = torch.randperm(len(self.dataset), generator=g).tolist()
+        else:
+            indices = list(range(len(self.dataset)))
+
+        # add extra samples to make it evenly divisible
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # subsample
+        indices = indices[self.rank:self.total_size:self.num_replicas]
+        assert len(indices) == self.num_samples
+
+        random_len_lst = [(i, self.length_lst[i]) for i in indices]
+
+        # Bucket examples based on similar output sequence length for
+        # efficiency. The partial last group used to be appended a second time
+        # (minus its final element), yielding more than num_samples indices.
+        return iter(_group_and_sort(random_len_lst, self.group_size))
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        """Set the epoch used to seed the shuffle in ``__iter__``."""
+        self.epoch = epoch
diff --git a/modelscope/models/audio/tts/models/datasets/units/__init__.py b/modelscope/models/audio/tts/models/datasets/units/__init__.py
new file mode 100644
index 00000000..4d03df04
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/units/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .ling_unit import *  # noqa F403
diff --git a/modelscope/models/audio/tts/models/datasets/units/cleaners.py b/modelscope/models/audio/tts/models/datasets/units/cleaners.py
new file mode 100644
index 00000000..07d4fbdb
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/units/cleaners.py
@@ -0,0 +1,88 @@
+# from https://github.com/keithito/tacotron
+# Cleaners are transformations that run over the input text at both training and eval time.
+#
+# Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+# hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+#   1. "english_cleaners" for English text
+#   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+#      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+#   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+#      the symbols in symbols.py to match your data).
+
+import re
+
+from unidecode import unidecode
+
+from .numbers import normalize_numbers
+
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r'\s+')
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [
+    (re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
+    for x in [('mrs', 'misess'),
+              ('mr', 'mister'),
+              ('dr', 'doctor'),
+              ('st', 'saint'),
+              ('co', 'company'),
+              ('jr', 'junior'),
+              ('maj', 'major'),
+              ('gen', 'general'),
+              ('drs', 'doctors'),
+              ('rev', 'reverend'),
+              ('lt', 'lieutenant'),
+              ('hon', 'honorable'),
+              ('sgt', 'sergeant'),
+              ('capt', 'captain'),
+              ('esq', 'esquire'),
+              ('ltd', 'limited'),
+              ('col', 'colonel'),
+              ('ft', 'fort'), ]]  # yapf:disable
+
+
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def expand_numbers(text):
+    return normalize_numbers(text)
+
+
+def lowercase(text):
+    return text.lower()
+
+
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, ' ', text)
+
+
+def convert_to_ascii(text):
+    return unidecode(text)
+
+
+def basic_cleaners(text):
+    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def transliteration_cleaners(text):
+    '''Pipeline for non-English text that transliterates to ASCII.'''
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def english_cleaners(text):
+    '''Pipeline for English text, including number and abbreviation expansion.'''
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = expand_numbers(text)
+    text = expand_abbreviations(text)
+    text = collapse_whitespace(text)
+    return text
diff --git a/modelscope/models/audio/tts/models/datasets/units/ling_unit.py b/modelscope/models/audio/tts/models/datasets/units/ling_unit.py
new file mode 100644
index 00000000..3c211cc7
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/units/ling_unit.py
@@ -0,0 +1,395 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import abc
+import codecs
+import os
+import re
+import shutil
+
+import json
+import numpy as np
+
+from . import cleaners as cleaners
+
+# Regular expression matching text enclosed in curly braces:
+_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
+
+
+def _clean_text(text, cleaner_names):
+    for name in cleaner_names:
+        cleaner = getattr(cleaners, name, None)  # unknown name -> None
+        if not cleaner:
+            raise Exception(
+                'modelscope error: configuration cleaner unknown: %s' % name)
+        text = cleaner(text)
+    return text
+
+
+class LinguisticBaseUnit(abc.ABC):
+
+    def set_config_params(self, config_params):
+        self.config_params = config_params
+
+    def save(self, config, config_name, path):
+        target = os.path.join(path, config_name)
+        if config != target:  # nothing to copy when config is already target
+            os.makedirs(path, exist_ok=True)
+            shutil.copyfile(config, target)
+
+
+class KanTtsLinguisticUnit(LinguisticBaseUnit):
+
+    def __init__(self, config, path, has_mask=True):
+        super(KanTtsLinguisticUnit, self).__init__()
+
+        # special symbol
+        self._pad = '_'
+        self._eos = '~'
+        self._mask = '@[MASK]'
+        self._has_mask = has_mask
+        self._unit_config = config
+        self._path = path
+
+        self._cleaner_names = [
+            x.strip() for x in self._unit_config['cleaners'].split(',')
+        ]
+        self._lfeat_type_list = self._unit_config['lfeat_type_list'].strip(
+        ).split(',')
+
+        self.build()
+
+    def get_unit_size(self):
+        ling_unit_size = {}
+        ling_unit_size['sy'] = len(self.sy)
+        ling_unit_size['tone'] = len(self.tone)
+        ling_unit_size['syllable_flag'] = len(self.syllable_flag)
+        ling_unit_size['word_segment'] = len(self.word_segment)
+
+        if 'emo_category' in self._lfeat_type_list:
+            ling_unit_size['emotion'] = len(self.emo_category)
+        if 'speaker_category' in self._lfeat_type_list:
+            ling_unit_size['speaker'] = len(self.speaker)
+
+        return ling_unit_size
+
+    def build(self):
+        """Load each sub-unit symbol file and build its id lookup tables."""
+        self._sub_unit_dim = {}
+        self._sub_unit_pad = {}
+        # sy sub-unit
+        _characters = ''
+
+        _ch_symbols = []
+
+        sy_path = os.path.join(self._path, self._unit_config['sy'])
+        with codecs.open(sy_path, 'r') as f:  # 'with' closes the handle
+            for line in f:
+                line = line.strip('\r\n')
+                _ch_symbols.append(line)
+
+        _arpabet = ['@' + s for s in _ch_symbols]
+
+        # Export all symbols:
+        self.sy = list(_characters) + _arpabet + [self._pad, self._eos]
+        if self._has_mask:
+            self.sy.append(self._mask)
+        self._sy_to_id = {s: i for i, s in enumerate(self.sy)}
+        self._id_to_sy = {i: s for i, s in enumerate(self.sy)}
+        self._sub_unit_dim['sy'] = len(self.sy)
+        self._sub_unit_pad['sy'] = self._sy_to_id['_']
+
+        # tone sub-unit
+        _characters = ''
+
+        _ch_tones = []
+
+        tone_path = os.path.join(self._path, self._unit_config['tone'])
+        with codecs.open(tone_path, 'r') as f:
+            for line in f:
+                line = line.strip('\r\n')
+                _ch_tones.append(line)
+
+        # Export all tones:
+        self.tone = list(_characters) + _ch_tones + [self._pad, self._eos]
+        if self._has_mask:
+            self.tone.append(self._mask)
+        self._tone_to_id = {s: i for i, s in enumerate(self.tone)}
+        self._id_to_tone = {i: s for i, s in enumerate(self.tone)}
+        self._sub_unit_dim['tone'] = len(self.tone)
+        self._sub_unit_pad['tone'] = self._tone_to_id['_']
+
+        # syllable flag sub-unit
+        _characters = ''
+
+        _ch_syllable_flags = []
+
+        sy_flag_path = os.path.join(self._path,
+                                    self._unit_config['syllable_flag'])
+        with codecs.open(sy_flag_path, 'r') as f:
+            for line in f:
+                line = line.strip('\r\n')
+                _ch_syllable_flags.append(line)
+
+        # Export all syllable_flags:
+        self.syllable_flag = list(_characters) + _ch_syllable_flags + [
+            self._pad, self._eos
+        ]
+        if self._has_mask:
+            self.syllable_flag.append(self._mask)
+        self._syllable_flag_to_id = {
+            s: i
+            for i, s in enumerate(self.syllable_flag)
+        }
+        self._id_to_syllable_flag = {
+            i: s
+            for i, s in enumerate(self.syllable_flag)
+        }
+        self._sub_unit_dim['syllable_flag'] = len(self.syllable_flag)
+        self._sub_unit_pad['syllable_flag'] = self._syllable_flag_to_id['_']
+
+        # word segment sub-unit
+        _characters = ''
+
+        _ch_word_segments = []
+
+        ws_path = os.path.join(self._path, self._unit_config['word_segment'])
+        with codecs.open(ws_path, 'r') as f:
+            for line in f:
+                line = line.strip('\r\n')
+                _ch_word_segments.append(line)
+
+        # Export all word segments:
+        self.word_segment = list(_characters) + _ch_word_segments + [
+            self._pad, self._eos
+        ]
+        if self._has_mask:
+            self.word_segment.append(self._mask)
+        self._word_segment_to_id = {
+            s: i
+            for i, s in enumerate(self.word_segment)
+        }
+        self._id_to_word_segment = {
+            i: s
+            for i, s in enumerate(self.word_segment)
+        }
+        self._sub_unit_dim['word_segment'] = len(self.word_segment)
+        self._sub_unit_pad['word_segment'] = self._word_segment_to_id['_']
+
+        if 'emo_category' in self._lfeat_type_list:
+            # emotion category sub-unit
+            _characters = ''
+
+            _ch_emo_types = []
+
+            emo_path = os.path.join(self._path,
+                                    self._unit_config['emo_category'])
+            with codecs.open(emo_path, 'r') as f:
+                for line in f:
+                    line = line.strip('\r\n')
+                    _ch_emo_types.append(line)
+
+            self.emo_category = list(_characters) + _ch_emo_types + [
+                self._pad, self._eos
+            ]
+            if self._has_mask:
+                self.emo_category.append(self._mask)
+            self._emo_category_to_id = {
+                s: i
+                for i, s in enumerate(self.emo_category)
+            }
+            self._id_to_emo_category = {
+                i: s
+                for i, s in enumerate(self.emo_category)
+            }
+            self._sub_unit_dim['emo_category'] = len(self.emo_category)
+            self._sub_unit_pad['emo_category'] = self._emo_category_to_id['_']
+
+        if 'speaker_category' in self._lfeat_type_list:
+            # speaker category sub-unit
+            _characters = ''
+
+            _ch_speakers = []
+
+            speaker_path = os.path.join(self._path,
+                                        self._unit_config['speaker_category'])
+            with codecs.open(speaker_path, 'r') as f:
+                for line in f:
+                    line = line.strip('\r\n')
+                    _ch_speakers.append(line)
+
+            # Export all speakers:
+            self.speaker = list(_characters) + _ch_speakers + [
+                self._pad, self._eos
+            ]
+            if self._has_mask:
+                self.speaker.append(self._mask)
+            self._speaker_to_id = {s: i for i, s in enumerate(self.speaker)}
+            self._id_to_speaker = {i: s for i, s in enumerate(self.speaker)}
+            self._sub_unit_dim['speaker_category'] = len(self._speaker_to_id)
+            self._sub_unit_pad['speaker_category'] = self._speaker_to_id['_']
+
+    def encode_symbol_sequence(self, lfeat_symbol):
+        lfeat_symbol = lfeat_symbol.strip().split(' ')
+
+        lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list))
+        for this_lfeat_symbol in lfeat_symbol:
+            this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split(
+                '$')
+            index = 0
+            while index < len(lfeat_symbol_separate):
+                lfeat_symbol_separate[index] = lfeat_symbol_separate[
+                    index] + this_lfeat_symbol[index] + ' '
+                index = index + 1
+
+        input_and_label_data = []
+        index = 0
+        while index < len(self._lfeat_type_list):
+            sequence = self.encode_sub_unit(
+                lfeat_symbol_separate[index].strip(),
+                self._lfeat_type_list[index])
+            sequence_array = np.asarray(sequence, dtype=np.int32)
+            input_and_label_data.append(sequence_array)
+            index = index + 1
+
+        return input_and_label_data
+
+    def decode_symbol_sequence(self, sequence):
+        result = []
+        for i, lfeat_type in enumerate(self._lfeat_type_list):
+            s = ''
+            sequence_item = sequence[i].tolist()
+            if lfeat_type == 'sy':
+                s = self.decode_sy(sequence_item)
+            elif lfeat_type == 'tone':
+                s = self.decode_tone(sequence_item)
+            elif lfeat_type == 'syllable_flag':
+                s = self.decode_syllable_flag(sequence_item)
+            elif lfeat_type == 'word_segment':
+                s = self.decode_word_segment(sequence_item)
+            elif lfeat_type == 'emo_category':
+                s = self.decode_emo_category(sequence_item)
+            elif lfeat_type == 'speaker_category':
+                s = self.decode_speaker_category(sequence_item)
+            else:
+                raise Exception(
+                    'modelscope error: configuration lfeat type(%s) unknown.'
+                    % lfeat_type)
+            result.append('%s:%s' % (lfeat_type, s))
+
+        return result
+
+    def encode_sub_unit(self, this_lfeat_symbol, lfeat_type):
+        sequence = []
+        if lfeat_type == 'sy':
+            this_lfeat_symbol = this_lfeat_symbol.strip().split(' ')
+            this_lfeat_symbol_format = ''
+            index = 0
+            while index < len(this_lfeat_symbol):
+                this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[
+                    index] + '}' + ' '
+                index = index + 1
+            sequence = self.encode_text(this_lfeat_symbol_format,
+                                        self._cleaner_names)
+        elif lfeat_type == 'tone':
+            sequence = self.encode_tone(this_lfeat_symbol)
+        elif lfeat_type == 'syllable_flag':
+            sequence = self.encode_syllable_flag(this_lfeat_symbol)
+        elif lfeat_type == 'word_segment':
+            sequence = self.encode_word_segment(this_lfeat_symbol)
+        elif lfeat_type == 'emo_category':
+            sequence = self.encode_emo_category(this_lfeat_symbol)
+        elif lfeat_type == 'speaker_category':
+            sequence = self.encode_speaker_category(this_lfeat_symbol)
+        else:
+            raise Exception(
+                'modelscope error: configuration lfeat type(%s) unknown.'
+                % lfeat_type)
+
+        return sequence
+
+    def encode_text(self, text, cleaner_names):
+        sequence = []
+
+        # Check for curly braces and treat their contents as ARPAbet:
+        while len(text):
+            m = _curly_re.match(text)
+            if not m:
+                sequence += self.encode_sy(_clean_text(text, cleaner_names))
+                break
+            sequence += self.encode_sy(_clean_text(m.group(1), cleaner_names))
+            sequence += self.encode_arpanet(m.group(2))
+            text = m.group(3)
+
+        # Append EOS token
+        sequence.append(self._sy_to_id['~'])
+        return sequence
+
+    def encode_sy(self, sy):
+        return [self._sy_to_id[s] for s in sy if self.should_keep_sy(s)]
+
+    def decode_sy(self, id):
+        s = self._id_to_sy[id]
+        if len(s) > 1 and s[0] == '@':
+            s = s[1:]
+        return s
+
+    def should_keep_sy(self, s):
+        return s in self._sy_to_id and s != '_' and s != '~'
+
+    def encode_arpanet(self, text):
+        return self.encode_sy(['@' + s for s in text.split()])
+
+    def encode_tone(self, tone):
+        tones = tone.strip().split(' ')
+        sequence = []
+        for this_tone in tones:
+            sequence.append(self._tone_to_id[this_tone])
+        sequence.append(self._tone_to_id['~'])
+        return sequence
+
+    def decode_tone(self, id):
+        return self._id_to_tone[id]
+
+    def encode_syllable_flag(self, syllable_flag):
+        syllable_flags = syllable_flag.strip().split(' ')
+        sequence = []
+        for this_syllable_flag in syllable_flags:
+            sequence.append(self._syllable_flag_to_id[this_syllable_flag])
+        sequence.append(self._syllable_flag_to_id['~'])
+        return sequence
+
+    def decode_syllable_flag(self, id):
+        return self._id_to_syllable_flag[id]
+
+    def encode_word_segment(self, word_segment):
+        word_segments = word_segment.strip().split(' ')
+        sequence = []
+        for this_word_segment in word_segments:
+            sequence.append(self._word_segment_to_id[this_word_segment])
+        sequence.append(self._word_segment_to_id['~'])
+        return sequence
+
+    def decode_word_segment(self, id):
+        return self._id_to_word_segment[id]
+
+    def encode_emo_category(self, emo_type):
+        emo_categories = emo_type.strip().split(' ')
+        sequence = []
+        for this_category in emo_categories:
+            sequence.append(self._emo_category_to_id[this_category])
+        sequence.append(self._emo_category_to_id['~'])
+        return sequence
+
+    def decode_emo_category(self, id):
+        return self._id_to_emo_category[id]
+
+    def encode_speaker_category(self, speaker):
+        speakers = speaker.strip().split(' ')
+        sequence = []
+        for this_speaker in speakers:
+            sequence.append(self._speaker_to_id[this_speaker])
+        sequence.append(self._speaker_to_id['~'])
+        return sequence
+
+    def decode_speaker_category(self, id):
+        return self._id_to_speaker[id]
diff --git a/modelscope/models/audio/tts/text/numbers.py b/modelscope/models/audio/tts/models/datasets/units/numbers.py
old mode 100755
new mode 100644
similarity index 94%
rename from modelscope/models/audio/tts/text/numbers.py
rename to modelscope/models/audio/tts/models/datasets/units/numbers.py
index d9453fee..d8835059
--- a/modelscope/models/audio/tts/text/numbers.py
+++ b/modelscope/models/audio/tts/models/datasets/units/numbers.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from tacotron,
+# made publicly available under the MIT License at https://github.com/keithito/tacotron
+
 import re
 
 import inflect
diff --git a/modelscope/models/audio/tts/models/fsmn.py b/modelscope/models/audio/tts/models/fsmn.py
deleted file mode 100755
index 875c27f0..00000000
--- a/modelscope/models/audio/tts/models/fsmn.py
+++ /dev/null
@@ -1,273 +0,0 @@
-import tensorflow as tf
-
-
-def build_sequence_mask(sequence_length,
-                        maximum_length=None,
-                        dtype=tf.float32):
-    """Builds the dot product mask.
-
-    Args:
-      sequence_length: The sequence length.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, max_length]``.
-    """
-    mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-
-    return mask
-
-
-def norm(inputs):
-    """Layer normalizes :obj:`inputs`."""
-    return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)
-
-
-def pad_in_time(x, padding_shape):
-    """Helper function to pad a tensor in the time dimension and retain the static depth dimension.
-
-       Agrs:
-        x: [Batch, Time, Frequency]
-        padding_length: padding size of constant value (0) before the time dimension
-
-      return:
-        padded x
-    """
-
-    depth = x.get_shape().as_list()[-1]
-    x = tf.pad(x, [[0, 0], padding_shape, [0, 0]])
-    x.set_shape((None, None, depth))
-
-    return x
-
-
-def pad_in_time_right(x, padding_length):
-    """Helper function to pad a tensor in the time dimension and retain the static depth dimension.
-
-       Agrs:
-        x: [Batch, Time, Frequency]
-        padding_length: padding size of constant value (0) before the time dimension
-
-      return:
-        padded x
-    """
-    depth = x.get_shape().as_list()[-1]
-    x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
-    x.set_shape((None, None, depth))
-
-    return x
-
-
-def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0):
-    """Implements the Transformer's "Feed Forward" layer.
-
-    .. math::
-
-        ffn(x) = max(0, x*W_1 + b_1)*W_2
-
-    Args:
-      x: The input.
-      ffn_dim: The number of units of the nonlinear transformation.
-      memory_units: the number of units of linear transformation
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units from the inner transformation.
-
-    Returns:
-      The transformed input.
-    """
-    inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu)
-    inner = tf.layers.dropout(
-        inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN)
-    outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False)
-
-    return outer
-
-
-def drop_and_add(inputs, outputs, mode, dropout=0.0):
-    """Drops units in the outputs and adds the previous values.
-
-    Args:
-      inputs: The input of the previous layer.
-      outputs: The output of the previous layer.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units in :obj:`outputs`.
-
-    Returns:
-      The residual and normalized output.
-    """
-    outputs = tf.layers.dropout(outputs, rate=dropout, training=mode)
-
-    input_dim = inputs.get_shape().as_list()[-1]
-    output_dim = outputs.get_shape().as_list()[-1]
-
-    if input_dim == output_dim:
-        outputs += inputs
-
-    return outputs
-
-
-def MemoryBlock(
-    inputs,
-    filter_size,
-    mode,
-    mask=None,
-    dropout=0.0,
-):
-    """
-    Define the bidirectional memory block in FSMN
-
-    Agrs:
-      inputs: The output of the previous layer. [Batch, Time, Frequency]
-      filter_size: memory block filter size
-      mode: Training or Evaluation
-      mask: A ``tf.Tensor`` applied to the memory block output
-
-    return:
-      output: 3-D tensor ([Batch, Time, Frequency])
-    """
-    static_shape = inputs.get_shape().as_list()
-    depth = static_shape[-1]
-    inputs = tf.expand_dims(inputs, axis=1)  # [Batch, 1, Time, Frequency]
-    depthwise_filter = tf.get_variable(
-        'depth_conv_w',
-        shape=[1, filter_size, depth, 1],
-        initializer=tf.glorot_uniform_initializer(),
-        dtype=tf.float32)
-    memory = tf.nn.depthwise_conv2d(
-        input=inputs,
-        filter=depthwise_filter,
-        strides=[1, 1, 1, 1],
-        padding='SAME',
-        rate=[1, 1],
-        data_format='NHWC')
-    memory = memory + inputs
-    output = tf.layers.dropout(memory, rate=dropout, training=mode)
-    output = tf.reshape(
-        output,
-        [tf.shape(output)[0], tf.shape(output)[2], depth])
-    if mask is not None:
-        output = output * tf.expand_dims(mask, -1)
-
-    return output
-
-
-def MemoryBlockV2(
-    inputs,
-    filter_size,
-    mode,
-    shift=0,
-    mask=None,
-    dropout=0.0,
-):
-    """
-    Define the bidirectional memory block in FSMN
-
-    Agrs:
-      inputs: The output of the previous layer. [Batch, Time, Frequency]
-      filter_size: memory block filter size
-      mode: Training or Evaluation
-      shift: left padding, to control delay
-      mask: A ``tf.Tensor`` applied to the memory block output
-
-    return:
-      output: 3-D tensor ([Batch, Time, Frequency])
-    """
-    if mask is not None:
-        inputs = inputs * tf.expand_dims(mask, -1)
-
-    static_shape = inputs.get_shape().as_list()
-    depth = static_shape[-1]
-    # padding
-    left_padding = int(round((filter_size - 1) / 2))
-    right_padding = int((filter_size - 1) / 2)
-    if shift > 0:
-        left_padding = left_padding + shift
-        right_padding = right_padding - shift
-    pad_inputs = pad_in_time(inputs, [left_padding, right_padding])
-    pad_inputs = tf.expand_dims(
-        pad_inputs, axis=1)  # [Batch, 1, Time, Frequency]
-    depthwise_filter = tf.get_variable(
-        'depth_conv_w',
-        shape=[1, filter_size, depth, 1],
-        initializer=tf.glorot_uniform_initializer(),
-        dtype=tf.float32)
-    memory = tf.nn.depthwise_conv2d(
-        input=pad_inputs,
-        filter=depthwise_filter,
-        strides=[1, 1, 1, 1],
-        padding='VALID',
-        rate=[1, 1],
-        data_format='NHWC')
-    memory = tf.reshape(
-        memory,
-        [tf.shape(memory)[0], tf.shape(memory)[2], depth])
-    memory = memory + inputs
-    output = tf.layers.dropout(memory, rate=dropout, training=mode)
-    if mask is not None:
-        output = output * tf.expand_dims(mask, -1)
-
-    return output
-
-
-def UniMemoryBlock(
-    inputs,
-    filter_size,
-    mode,
-    cache=None,
-    mask=None,
-    dropout=0.0,
-):
-    """
-    Define the unidirectional memory block in FSMN
-
-    Agrs:
-      inputs: The output of the previous layer. [Batch, Time, Frequency]
-      filter_size: memory block filter size
-      cache: for streaming inference
-      mode: Training or Evaluation
-      mask: A ``tf.Tensor`` applied to the memory block output
-      dropout: dorpout factor
-    return:
-      output: 3-D tensor ([Batch, Time, Frequency])
-    """
-    if cache is not None:
-        static_shape = cache['queries'].get_shape().as_list()
-        depth = static_shape[-1]
-        queries = tf.slice(cache['queries'], [0, 1, 0], [
-            tf.shape(cache['queries'])[0],
-            tf.shape(cache['queries'])[1] - 1, depth
-        ])
-        queries = tf.concat([queries, inputs], axis=1)
-        cache['queries'] = queries
-    else:
-        padding_length = filter_size - 1
-        queries = pad_in_time(inputs, [padding_length, 0])
-
-    queries = tf.expand_dims(queries, axis=1)  # [Batch, 1, Time, Frequency]
-    static_shape = queries.get_shape().as_list()
-    depth = static_shape[-1]
-    depthwise_filter = tf.get_variable(
-        'depth_conv_w',
-        shape=[1, filter_size, depth, 1],
-        initializer=tf.glorot_uniform_initializer(),
-        dtype=tf.float32)
-    memory = tf.nn.depthwise_conv2d(
-        input=queries,
-        filter=depthwise_filter,
-        strides=[1, 1, 1, 1],
-        padding='VALID',
-        rate=[1, 1],
-        data_format='NHWC')
-    memory = tf.reshape(
-        memory,
-        [tf.shape(memory)[0], tf.shape(memory)[2], depth])
-    memory = memory + inputs
-    output = tf.layers.dropout(memory, rate=dropout, training=mode)
-    if mask is not None:
-        output = output * tf.expand_dims(mask, -1)
-
-    return output
diff --git a/modelscope/models/audio/tts/models/fsmn_encoder.py b/modelscope/models/audio/tts/models/fsmn_encoder.py
deleted file mode 100755
index 2c650624..00000000
--- a/modelscope/models/audio/tts/models/fsmn_encoder.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import tensorflow as tf
-
-from . import fsmn
-
-
-class FsmnEncoder():
-    """Encoder using Fsmn
-    """
-
-    def __init__(self,
-                 filter_size,
-                 fsmn_num_layers,
-                 dnn_num_layers,
-                 num_memory_units=512,
-                 ffn_inner_dim=2048,
-                 dropout=0.0,
-                 position_encoder=None):
-        """Initializes the parameters of the encoder.
-
-        Args:
-          filter_size: the total order of memory block
-          fsmn_num_layers: The number of fsmn layers.
-          dnn_num_layers: The number of dnn layers
-          num_units: The number of memory units.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-        """
-        super(FsmnEncoder, self).__init__()
-        self.filter_size = filter_size
-        self.fsmn_num_layers = fsmn_num_layers
-        self.dnn_num_layers = dnn_num_layers
-        self.num_memory_units = num_memory_units
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.position_encoder = position_encoder
-
-    def encode(self, inputs, sequence_length=None, mode=True):
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(inputs)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-
-        mask = fsmn.build_sequence_mask(
-            sequence_length, maximum_length=tf.shape(inputs)[1])
-
-        state = ()
-
-        for layer in range(self.fsmn_num_layers):
-            with tf.variable_scope('fsmn_layer_{}'.format(layer)):
-                with tf.variable_scope('ffn'):
-                    context = fsmn.feed_forward(
-                        inputs,
-                        self.ffn_inner_dim,
-                        self.num_memory_units,
-                        mode,
-                        dropout=self.dropout)
-
-                with tf.variable_scope('memory'):
-                    memory = fsmn.MemoryBlock(
-                        context,
-                        self.filter_size,
-                        mode,
-                        mask=mask,
-                        dropout=self.dropout)
-
-                    memory = fsmn.drop_and_add(
-                        inputs, memory, mode, dropout=self.dropout)
-
-                inputs = memory
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        for layer in range(self.dnn_num_layers):
-            with tf.variable_scope('dnn_layer_{}'.format(layer)):
-                transformed = fsmn.feed_forward(
-                    inputs,
-                    self.ffn_inner_dim,
-                    self.num_memory_units,
-                    mode,
-                    dropout=self.dropout)
-
-                inputs = transformed
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        outputs = inputs
-        return (outputs, state, sequence_length)
-
-
-class FsmnEncoderV2():
-    """Encoder using Fsmn
-    """
-
-    def __init__(self,
-                 filter_size,
-                 fsmn_num_layers,
-                 dnn_num_layers,
-                 num_memory_units=512,
-                 ffn_inner_dim=2048,
-                 dropout=0.0,
-                 shift=0,
-                 position_encoder=None):
-        """Initializes the parameters of the encoder.
-
-        Args:
-          filter_size: the total order of memory block
-          fsmn_num_layers: The number of fsmn layers.
-          dnn_num_layers: The number of dnn layers
-          num_units: The number of memory units.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          shift: left padding, to control delay
-          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-        """
-        super(FsmnEncoderV2, self).__init__()
-        self.filter_size = filter_size
-        self.fsmn_num_layers = fsmn_num_layers
-        self.dnn_num_layers = dnn_num_layers
-        self.num_memory_units = num_memory_units
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.shift = shift
-        if not isinstance(shift, list):
-            self.shift = [shift for _ in range(self.fsmn_num_layers)]
-        self.position_encoder = position_encoder
-
-    def encode(self, inputs, sequence_length=None, mode=True):
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(inputs)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-
-        mask = fsmn.build_sequence_mask(
-            sequence_length, maximum_length=tf.shape(inputs)[1])
-
-        state = ()
-        for layer in range(self.fsmn_num_layers):
-            with tf.variable_scope('fsmn_layer_{}'.format(layer)):
-                with tf.variable_scope('ffn'):
-                    context = fsmn.feed_forward(
-                        inputs,
-                        self.ffn_inner_dim,
-                        self.num_memory_units,
-                        mode,
-                        dropout=self.dropout)
-
-                with tf.variable_scope('memory'):
-                    memory = fsmn.MemoryBlockV2(
-                        context,
-                        self.filter_size,
-                        mode,
-                        shift=self.shift[layer],
-                        mask=mask,
-                        dropout=self.dropout)
-
-                    memory = fsmn.drop_and_add(
-                        inputs, memory, mode, dropout=self.dropout)
-
-                inputs = memory
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        for layer in range(self.dnn_num_layers):
-            with tf.variable_scope('dnn_layer_{}'.format(layer)):
-                transformed = fsmn.feed_forward(
-                    inputs,
-                    self.ffn_inner_dim,
-                    self.num_memory_units,
-                    mode,
-                    dropout=self.dropout)
-
-                inputs = transformed
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        outputs = inputs
-        return (outputs, state, sequence_length)
diff --git a/modelscope/models/audio/tts/models/helpers.py b/modelscope/models/audio/tts/models/helpers.py
deleted file mode 100755
index 371000a4..00000000
--- a/modelscope/models/audio/tts/models/helpers.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import numpy as np
-import tensorflow as tf
-
-
-class VarTestHelper(tf.contrib.seq2seq.Helper):
-
-    def __init__(self, batch_size, inputs, dim):
-        with tf.name_scope('VarTestHelper'):
-            self._batch_size = batch_size
-            self._inputs = inputs
-            self._dim = dim
-
-            num_steps = tf.shape(self._inputs)[1]
-            self._lengths = tf.tile([num_steps], [self._batch_size])
-
-            self._inputs = tf.roll(inputs, shift=-1, axis=1)
-            self._init_inputs = inputs[:, 0, :]
-
-    @property
-    def batch_size(self):
-        return self._batch_size
-
-    @property
-    def sample_ids_shape(self):
-        return tf.TensorShape([])
-
-    @property
-    def sample_ids_dtype(self):
-        return np.int32
-
-    def initialize(self, name=None):
-        return (tf.tile([False], [self._batch_size]),
-                _go_frames(self._batch_size, self._dim, self._init_inputs))
-
-    def sample(self, time, outputs, state, name=None):
-        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them
-
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        with tf.name_scope('VarTestHelper'):
-            finished = (time + 1 >= self._lengths)
-            next_inputs = tf.concat([outputs, self._inputs[:, time, :]],
-                                    axis=-1)
-            return (finished, next_inputs, state)
-
-
-class VarTrainingHelper(tf.contrib.seq2seq.Helper):
-
-    def __init__(self, targets, inputs, dim):
-        with tf.name_scope('VarTrainingHelper'):
-            self._targets = targets  # [N, T_in, 1]
-            self._batch_size = tf.shape(inputs)[0]  # N
-            self._inputs = inputs
-            self._dim = dim
-
-            num_steps = tf.shape(self._targets)[1]
-            self._lengths = tf.tile([num_steps], [self._batch_size])
-
-            self._inputs = tf.roll(inputs, shift=-1, axis=1)
-            self._init_inputs = inputs[:, 0, :]
-
-    @property
-    def batch_size(self):
-        return self._batch_size
-
-    @property
-    def sample_ids_shape(self):
-        return tf.TensorShape([])
-
-    @property
-    def sample_ids_dtype(self):
-        return np.int32
-
-    def initialize(self, name=None):
-        return (tf.tile([False], [self._batch_size]),
-                _go_frames(self._batch_size, self._dim, self._init_inputs))
-
-    def sample(self, time, outputs, state, name=None):
-        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them
-
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        with tf.name_scope(name or 'VarTrainingHelper'):
-            finished = (time + 1 >= self._lengths)
-            next_inputs = tf.concat(
-                [self._targets[:, time, :], self._inputs[:, time, :]], axis=-1)
-            return (finished, next_inputs, state)
-
-
-class VarTrainingSSHelper(tf.contrib.seq2seq.Helper):
-
-    def __init__(self, targets, inputs, dim, global_step, schedule_begin,
-                 alpha, decay_steps):
-        with tf.name_scope('VarTrainingSSHelper'):
-            self._targets = targets  # [N, T_in, 1]
-            self._batch_size = tf.shape(inputs)[0]  # N
-            self._inputs = inputs
-            self._dim = dim
-
-            num_steps = tf.shape(self._targets)[1]
-            self._lengths = tf.tile([num_steps], [self._batch_size])
-
-            self._inputs = tf.roll(inputs, shift=-1, axis=1)
-            self._init_inputs = inputs[:, 0, :]
-
-            # for schedule sampling
-            self._global_step = global_step
-            self._schedule_begin = schedule_begin
-            self._alpha = alpha
-            self._decay_steps = decay_steps
-
-    @property
-    def batch_size(self):
-        return self._batch_size
-
-    @property
-    def sample_ids_shape(self):
-        return tf.TensorShape([])
-
-    @property
-    def sample_ids_dtype(self):
-        return np.int32
-
-    def initialize(self, name=None):
-        self._ratio = _tf_decay(self._global_step, self._schedule_begin,
-                                self._alpha, self._decay_steps)
-        return (tf.tile([False], [self._batch_size]),
-                _go_frames(self._batch_size, self._dim, self._init_inputs))
-
-    def sample(self, time, outputs, state, name=None):
-        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them
-
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        with tf.name_scope(name or 'VarTrainingHelper'):
-            finished = (time + 1 >= self._lengths)
-            next_inputs_tmp = tf.cond(
-                tf.less(
-                    tf.random_uniform([], minval=0, maxval=1,
-                                      dtype=tf.float32), self._ratio),
-                lambda: self._targets[:, time, :], lambda: outputs)
-            next_inputs = tf.concat(
-                [next_inputs_tmp, self._inputs[:, time, :]], axis=-1)
-            return (finished, next_inputs, state)
-
-
-def _go_frames(batch_size, dim, init_inputs):
-    '''Returns all-zero <GO> frames for a given batch size and output dimension'''
-    return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs],
-                     axis=-1)
-
-
-def _tf_decay(global_step, schedule_begin, alpha, decay_steps):
-    tfr = tf.train.exponential_decay(
-        1.0,
-        global_step=global_step - schedule_begin,
-        decay_steps=decay_steps,
-        decay_rate=alpha,
-        name='tfr_decay')
-    final_tfr = tf.cond(
-        tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr)
-    return final_tfr
diff --git a/modelscope/models/audio/tts/models/models/__init__.py b/modelscope/models/audio/tts/models/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/audio/tts/models/models/hifigan/__init__.py b/modelscope/models/audio/tts/models/models/hifigan/__init__.py
new file mode 100644
index 00000000..ae9d10ea
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/hifigan/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .hifigan import *  # noqa F403
diff --git a/modelscope/models/audio/tts/models/models/hifigan/hifigan.py b/modelscope/models/audio/tts/models/models/hifigan/hifigan.py
new file mode 100755
index 00000000..0f950539
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/hifigan/hifigan.py
@@ -0,0 +1,238 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Part of the implementation is borrowed from https://github.com/jik876/hifi-gan
+
+from distutils.version import LooseVersion
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
+from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+
+from modelscope.models.audio.tts.models.utils import get_padding, init_weights
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7')
+
+
+def stft(x, fft_size, hop_size, win_length, window):
+    """Perform STFT and convert to magnitude spectrogram.
+
+    Args:
+        x (Tensor): Input signal tensor (B, T).
+        fft_size (int): FFT size.
+        hop_size (int): Hop size.
+        win_length (int): Window length.
+        window (Tensor): Window tensor, forwarded to ``torch.stft``.
+
+    Returns:
+        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+
+    """
+    if is_pytorch_17plus:
+        x_stft = torch.stft(
+            x, fft_size, hop_size, win_length, window, return_complex=False)
+    else:
+        x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
+    real = x_stft[..., 0]
+    imag = x_stft[..., 1]
+
+    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
+    return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
+
+
+LRELU_SLOPE = 0.1
+
+
+def get_padding_casual(kernel_size, dilation=1):
+    return int(kernel_size * dilation - dilation)  # i.e. (k - 1) * d
+
+
+class Conv1dCasual(torch.nn.Module):
+    """Weight-normalised Conv1d padded with zeros on the left only."""
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 padding_mode='zeros'):
+        super().__init__()
+        conv = Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding=0,  # the padding is applied by hand in forward()
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode)
+        self.pad = padding
+        self.conv1d = weight_norm(conv)
+        self.conv1d.apply(init_weights)
+
+    def forward(self, x):  # bdt
+        """Left-pad the last axis of x by self.pad zeros, then convolve."""
+        # F.pad pairs run from the last dim backwards: (left, right), ...
+        padded = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
+        return self.conv1d(padded)
+
+    def remove_weight_norm(self):
+        remove_weight_norm(self.conv1d)
+
+
+class ConvTranspose1dCausal(torch.nn.Module):
+    """CausalConvTranspose1d module with customized initialization."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding=0):
+        """Initialize the module; `padding` is accepted but never used."""
+        super(ConvTranspose1dCausal, self).__init__()
+        self.deconv = weight_norm(
+            ConvTranspose1d(in_channels, out_channels, kernel_size, stride))
+        self.stride = stride
+        self.deconv.apply(init_weights)
+        self.pad = kernel_size - stride  # trailing frames cut in forward()
+
+    def forward(self, x):
+        """Calculate forward propagation.
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T_in).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T_out).
+        """
+        # `or None` keeps all frames when pad == 0; ":-0" would drop them all.
+        return self.deconv(x)[:, :, :-self.pad or None]
+
+    def remove_weight_norm(self):
+        remove_weight_norm(self.deconv)
+
+
+class ResBlock1(torch.nn.Module):
+    """Residual block: per dilation, a dilated then a plain causal conv."""
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super().__init__()
+        self.h = h
+        # First conv of each pair uses the configured dilation rate.
+        self.convs1 = nn.ModuleList([
+            Conv1dCasual(
+                channels,
+                channels,
+                kernel_size,
+                1,
+                dilation=rate,
+                padding=get_padding_casual(kernel_size, rate))
+            for rate in dilation
+        ])
+        # Second conv of each pair is always undilated.
+        self.convs2 = nn.ModuleList([
+            Conv1dCasual(
+                channels,
+                channels,
+                kernel_size,
+                1,
+                dilation=1,
+                padding=get_padding_casual(kernel_size, 1))
+            for _ in dilation
+        ])
+
+    def forward(self, x):
+        for dilated, plain in zip(self.convs1, self.convs2):
+            branch = dilated(F.leaky_relu(x, LRELU_SLOPE))
+            branch = plain(F.leaky_relu(branch, LRELU_SLOPE))
+            x = branch + x
+        return x
+
+    def remove_weight_norm(self):
+        """Strip weight norm from every conv in both lists."""
+        for conv in self.convs1:
+            conv.remove_weight_norm()
+        for conv in self.convs2:
+            conv.remove_weight_norm()
+
+
+class Generator(torch.nn.Module):
+    """Causal HiFi-GAN-style generator; `h` supplies the hyper-parameters."""
+    def __init__(self, h):
+        super(Generator, self).__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        logger.info('num_kernels={}, num_upsamples={}'.format(
+            self.num_kernels, self.num_upsamples))
+        self.conv_pre = Conv1dCasual(
+            80, h.upsample_initial_channel, 7, 1, padding=7 - 1)
+        if h.resblock != '1':  # ResBlock2 does not exist in this module
+            raise NotImplementedError('only resblock "1" is implemented')
+        self.ups = nn.ModuleList()
+        self.repeat_ups = nn.ModuleList()
+        for i, (u, k) in enumerate(
+                zip(h.upsample_rates, h.upsample_kernel_sizes)):
+            upsample = nn.Sequential(
+                nn.Upsample(mode='nearest', scale_factor=u),
+                nn.LeakyReLU(LRELU_SLOPE),
+                Conv1dCasual(
+                    h.upsample_initial_channel // (2**i),
+                    h.upsample_initial_channel // (2**(i + 1)),
+                    kernel_size=7,
+                    stride=1,
+                    padding=7 - 1))
+            self.repeat_ups.append(upsample)
+            self.ups.append(
+                ConvTranspose1dCausal(
+                    h.upsample_initial_channel // (2**i),
+                    h.upsample_initial_channel // (2**(i + 1)),
+                    k,
+                    u,
+                    padding=(k - u) // 2))
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = h.upsample_initial_channel // (2**(i + 1))
+            for k, d in zip(h.resblock_kernel_sizes,
+                            h.resblock_dilation_sizes):
+                self.resblocks.append(ResBlock1(h, ch, k, d))
+
+        self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1)
+
+    def forward(self, x):
+        """Map x (B, 80, T) to a tanh-bounded signal (B, 1, T_out)."""
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = torch.sin(x) + x
+            # branch 1: causal transposed convolution
+            x1 = self.ups[i](F.leaky_relu(x, LRELU_SLOPE))
+            # branch 2: nearest-neighbour upsampling, then a causal conv
+            x2 = self.repeat_ups[i](x)
+            x = x1 + x2
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)  # NOTE(review): default slope, not LRELU_SLOPE
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+
+    def remove_weight_norm(self):
+        logger.info('Removing weight norm...')
+        for layer in self.ups:
+            layer.remove_weight_norm()
+        for layer in self.repeat_ups:
+            layer[-1].remove_weight_norm()  # the Conv1dCasual of the branch
+        for layer in self.resblocks:
+            layer.remove_weight_norm()
+        self.conv_pre.remove_weight_norm()
+        self.conv_post.remove_weight_norm()
diff --git a/modelscope/models/audio/tts/models/models/sambert/__init__.py b/modelscope/models/audio/tts/models/models/sambert/__init__.py
new file mode 100644
index 00000000..f0bf5290
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .kantts_sambert import *  # noqa F403
diff --git a/modelscope/models/audio/tts/models/models/sambert/adaptors.py b/modelscope/models/audio/tts/models/models/sambert/adaptors.py
new file mode 100644
index 00000000..c171a1db
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/adaptors.py
@@ -0,0 +1,131 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .base import Prenet
+from .fsmn import FsmnEncoderV2
+
+
+class LengthRegulator(nn.Module):
+    """Repeat every input step according to its rounded duration."""
+    def __init__(self, r=1):
+        super(LengthRegulator, self).__init__()
+        self.r = r  # output length is padded up to a multiple of r
+
+    def forward(self, inputs, durations, masks=None):
+        """Return (expanded inputs, per-item expanded lengths)."""
+        reps = (durations + 0.5).long()  # +0.5 then truncate
+        output_lens = reps.sum(dim=1)
+        max_len = output_lens.max()
+        reps_cumsum = torch.cumsum(
+            F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :]
+        range_ = torch.arange(max_len).to(inputs.device)[None, :, None]
+        # mult[b, t, i] is set when cumsum[i] <= t < cumsum[i + 1].
+        mult = ((reps_cumsum[:, :, :-1] <= range_)
+                & (reps_cumsum[:, :, 1:] > range_))  # yapf:disable
+        mult = mult.float()
+        out = torch.matmul(mult, inputs)
+        if masks is not None:
+            out = out.masked_fill(masks.unsqueeze(-1), 0.0)
+
+        seq_len = out.size(1)
+        padding = self.r - int(seq_len) % self.r
+        if (padding < self.r):
+            out = F.pad(
+                out.transpose(1, 2), (0, padding, 0, 0, 0, 0), value=0.0)
+            out = out.transpose(1, 2)
+
+        return out, output_lens
+
+
+class VarRnnARPredictor(nn.Module):
+    """Autoregressive scalar predictor: prenet -> 2-layer LSTM -> linear."""
+    def __init__(self, cond_units, prenet_units, rnn_units):
+        super().__init__()
+
+        lstm_in = prenet_units[-1] + cond_units
+        self.prenet = Prenet(1, prenet_units)
+        self.lstm = nn.LSTM(
+            lstm_in,
+            rnn_units,
+            num_layers=2,
+            batch_first=True,
+            bidirectional=False)
+        self.fc = nn.Linear(rnn_units, 1)
+
+    def forward(self, inputs, cond, h=None, masks=None):
+        """Run the LSTM over (prenet(inputs), cond); h is its initial state."""
+        # The input could be packed as a variable-length sequence; that is
+        # omitted for simplicity given the mask and uni-directional LSTM.
+        feats = torch.cat([self.prenet(inputs), cond], dim=-1)
+        hidden, h_new = self.lstm(feats, h)
+
+        pred = F.relu(self.fc(hidden).squeeze(-1))
+        if masks is not None:
+            pred = pred.masked_fill(masks, 0.0)
+
+        return pred, h_new
+
+    def infer(self, cond, masks=None):
+        """Predict step by step, feeding each output back as next input."""
+        batch_size, length = cond.size(0), cond.size(1)
+
+        steps = []
+        prev = torch.zeros((batch_size, 1)).to(cond.device)
+        state = None
+        for t in range(length):
+            prev, state = self.forward(
+                prev.unsqueeze(1), cond[:, t:t + 1, :], h=state)
+            steps.append(prev)
+
+        output = torch.cat(steps, dim=-1)
+        if masks is not None:
+            output = output.masked_fill(masks, 0.0)
+
+        return output
+
+
+class VarFsmnRnnNARPredictor(nn.Module):
+    """Non-autoregressive predictor: FSMN encoder -> BiLSTM -> linear."""
+    def __init__(self, in_dim, filter_size, fsmn_num_layers, num_memory_units,
+                 ffn_inner_dim, dropout, shift, lstm_units):
+        super().__init__()
+
+        self.fsmn = FsmnEncoderV2(filter_size, fsmn_num_layers, in_dim,
+                                  num_memory_units, ffn_inner_dim, dropout,
+                                  shift)
+        self.blstm = nn.LSTM(
+            num_memory_units,
+            lstm_units,
+            num_layers=1,
+            batch_first=True,
+            bidirectional=True)
+        self.fc = nn.Linear(2 * lstm_units, 1)
+
+    def forward(self, inputs, masks=None):
+        """Return one value per step; steps where masks is True become 0."""
+        encoded = self.fsmn(inputs, masks)
+
+        if masks is None:
+            # No lengths available: run the BiLSTM over the full sequences.
+            rnn_out, _ = self.blstm(encoded)
+            return self.fc(rnn_out).squeeze(-1)
+
+        # Valid length of each item = number of False entries in its mask row.
+        input_lengths = torch.sum((~masks).float(), dim=1).long()
+        packed = nn.utils.rnn.pack_padded_sequence(
+            encoded,
+            input_lengths.tolist(),
+            batch_first=True,
+            enforce_sorted=False)
+        packed, _ = self.blstm(packed)
+        rnn_out, _ = nn.utils.rnn.pad_packed_sequence(
+            packed, batch_first=True, total_length=inputs.size(1))
+
+        # Project to one value per step and zero the masked steps.
+        out = self.fc(rnn_out).squeeze(-1)
+        out = out.masked_fill(masks, 0.0)
+
+        return out
diff --git a/modelscope/models/audio/tts/models/models/sambert/base.py b/modelscope/models/audio/tts/models/models/sambert/base.py
new file mode 100644
index 00000000..873aecbf
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/base.py
@@ -0,0 +1,369 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ScaledDotProductAttention(nn.Module):
+    """Batched attention: softmax(q k^T / temperature) applied to v."""
+
+    def __init__(self, temperature, dropatt=0.0):
+        super().__init__()
+        self.temperature = temperature
+        self.softmax = nn.Softmax(dim=2)
+        self.dropatt = nn.Dropout(dropatt)
+
+    def forward(self, q, k, v, mask=None):
+        """Return (weighted values, attention weights after dropout)."""
+        scores = torch.bmm(q, k.transpose(1, 2)) / self.temperature
+
+        # Masked positions get -inf, i.e. zero weight after the softmax.
+        # NOTE(review): a fully masked row therefore yields NaN weights.
+        if mask is not None:
+            scores = scores.masked_fill(mask, -np.inf)
+
+        weights = self.dropatt(self.softmax(scores))
+        context = torch.bmm(weights, v)
+
+        return context, weights
+
+
+class Prenet(nn.Module):
+    """Stack of Linear -> ReLU -> Dropout(0.5), plus an optional Linear."""
+    def __init__(self, in_units, prenet_units, out_units=0):
+        super().__init__()
+
+        self.fcs = nn.ModuleList()
+        layer_in = [in_units] + prenet_units[:-1]
+        for fan_in, fan_out in zip(layer_in, prenet_units):
+            self.fcs.extend([
+                nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.5)
+            ])
+
+        if out_units:
+            self.fcs.append(nn.Linear(prenet_units[-1], out_units))
+
+    def forward(self, input):
+        hidden = input
+        for fc in self.fcs:
+            hidden = fc(hidden)
+        return hidden
+
+
+class MultiHeadSelfAttention(nn.Module):
+    """Multi-head self-attention with a fused q/k/v projection."""
+
+    def __init__(self, n_head, d_in, d_model, d_head, dropout, dropatt=0.0):
+        super().__init__()
+
+        self.n_head, self.d_head = n_head, d_head
+        self.d_in, self.d_model = d_in, d_model
+
+        # Layer norm runs on the input, ahead of the projection.
+        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
+        # A single linear layer yields q, k and v for every head.
+        self.w_qkv = nn.Linear(d_in, 3 * n_head * d_head)
+
+        self.attention = ScaledDotProductAttention(
+            temperature=np.power(d_head, 0.5), dropatt=dropatt)
+
+        self.fc = nn.Linear(n_head * d_head, d_model)
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, input, mask=None):
+        """Apply pre-norm multi-head self-attention.
+
+        Args:
+            input: tensor of size (sz_b, len_in, d_in).
+            mask: optional attention mask; it is repeated n_head times
+                along dim 0 before being passed to the attention.
+
+        Returns:
+            (output, attn). The residual is only added to output when
+            the last dimensions of output and input are equal.
+        """
+        d_head, n_head = self.d_head, self.n_head
+        sz_b, len_in, _ = input.size()
+
+        def split_heads(t):
+            # b x l x (n*d) -> (n*b) x l x d
+            t = t.view(sz_b, len_in, n_head, d_head).permute(2, 0, 1, 3)
+            return t.contiguous().view(-1, len_in, d_head)
+
+        qkv = self.w_qkv(self.layer_norm(input))
+        q, k, v = (split_heads(part) for part in qkv.chunk(3, -1))
+
+        if mask is not None:
+            mask = mask.repeat(n_head, 1, 1)  # (n*b) x .. x ..
+        heads, attn = self.attention(q, k, v, mask=mask)
+
+        # (n*b) x l x d -> b x l x (n*d)
+        heads = heads.view(n_head, sz_b, len_in, d_head)
+        merged = heads.permute(1, 2, 0, 3).contiguous().view(sz_b, len_in, -1)
+
+        output = self.dropout(self.fc(merged))
+        if output.size(-1) == input.size(-1):
+            output = output + input
+
+        return output, attn
+
+
+class PositionwiseConvFeedForward(nn.Module):
+    """Pre-norm feed-forward block made of two Conv1d layers + residual."""
+
+    def __init__(self,
+                 d_in,
+                 d_hid,
+                 kernel_size=(3, 1),
+                 dropout_inner=0.1,
+                 dropout=0.1):
+        super().__init__()
+        k_in, k_out = kernel_size[0], kernel_size[1]
+        # Expansion conv: d_in -> d_hid.
+        self.w_1 = nn.Conv1d(
+            d_in,
+            d_hid,
+            kernel_size=k_in,
+            padding=(k_in - 1) // 2,
+        )
+        # Projection conv: d_hid -> d_in.
+        self.w_2 = nn.Conv1d(
+            d_hid,
+            d_in,
+            kernel_size=k_out,
+            padding=(k_out - 1) // 2,
+        )
+
+        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
+        self.dropout_inner = nn.Dropout(dropout_inner)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, mask=None):
+        """Apply layer norm, the two convolutions and the residual.
+
+        `mask`, when given, zeroes the matching steps after the first
+        convolution's ReLU.
+        """
+        # Conv1d wants channels on dim 1, hence the transposes.
+        hidden = self.layer_norm(x).transpose(1, 2)
+        hidden = F.relu(self.w_1(hidden))
+        if mask is not None:
+            hidden = hidden.masked_fill(mask.unsqueeze(1), 0)
+        hidden = self.dropout_inner(hidden)
+        hidden = self.w_2(hidden)
+        hidden = self.dropout(hidden.transpose(1, 2))
+
+        return hidden + x
+
+
+class FFTBlock(nn.Module):
+    """Self-attention followed by a convolutional feed-forward layer."""
+
+    def __init__(self,
+                 d_in,
+                 d_model,
+                 n_head,
+                 d_head,
+                 d_inner,
+                 kernel_size,
+                 dropout,
+                 dropout_attn=0.0,
+                 dropout_relu=0.0):
+        super().__init__()
+        self.slf_attn = MultiHeadSelfAttention(
+            n_head, d_in, d_model, d_head, dropout=dropout,
+            dropatt=dropout_attn)
+        self.pos_ffn = PositionwiseConvFeedForward(
+            d_model, d_inner, kernel_size, dropout_inner=dropout_relu,
+            dropout=dropout)
+
+    def forward(self, input, mask=None, slf_attn_mask=None):
+        """Return (output, self-attention weights).
+
+        `slf_attn_mask` is handed to the attention; `mask` is handed to
+        the feed-forward layer and also zeroes masked steps after each
+        sub-layer.
+        """
+        def zero_masked(t):
+            if mask is None:
+                return t
+            return t.masked_fill(mask.unsqueeze(-1), 0)
+
+        attended, slf_attn = self.slf_attn(input, mask=slf_attn_mask)
+        attended = zero_masked(attended)
+        output = zero_masked(self.pos_ffn(attended, mask=mask))
+
+        return output, slf_attn
+
+
+class MultiHeadPNCAAttention(nn.Module):
+    """Multi-head PNCA attention: x attends to cached x states and to h."""
+
+    def __init__(self, n_head, d_model, d_mem, d_head, dropout, dropatt=0.0):
+        super().__init__()
+        self.n_head = n_head
+        self.d_head = d_head
+        self.d_model = d_model
+        self.d_mem = d_mem
+
+        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+        self.w_x_qkv = nn.Linear(d_model, 3 * n_head * d_head)
+        self.fc_x = nn.Linear(n_head * d_head, d_model)
+
+        self.w_h_kv = nn.Linear(d_mem, 2 * n_head * d_head)
+        self.fc_h = nn.Linear(n_head * d_head, d_model)
+
+        self.attention = ScaledDotProductAttention(
+            temperature=np.power(d_head, 0.5), dropatt=dropatt)
+
+        self.dropout = nn.Dropout(dropout)
+        # Create the (empty) caches so forward() works before reset_state().
+        self.reset_state()
+
+    def update_x_state(self, x):
+        """Project x to per-head q/k/v and append k/v to the x cache."""
+        d_head, n_head = self.d_head, self.n_head
+        sz_b, len_x, _ = x.size()
+
+        x_qkv = self.w_x_qkv(x)
+        x_q, x_k, x_v = x_qkv.chunk(3, -1)
+
+        x_q = x_q.view(sz_b, len_x, n_head, d_head)
+        x_k = x_k.view(sz_b, len_x, n_head, d_head)
+        x_v = x_v.view(sz_b, len_x, n_head, d_head)
+
+        x_q = x_q.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)
+        x_k = x_k.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)
+        x_v = x_v.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)
+
+        if (self.x_state_size):
+            self.x_k = torch.cat([self.x_k, x_k], dim=1)
+            self.x_v = torch.cat([self.x_v, x_v], dim=1)
+        else:
+            self.x_k = x_k
+            self.x_v = x_v
+
+        self.x_state_size += len_x
+
+        return x_q, x_k, x_v
+
+    def update_h_state(self, h):
+        """Cache per-head k/v of h unless an h of that length is cached."""
+        if (self.h_state_size == h.size(1)):
+            return None, None
+
+        d_head, n_head = self.d_head, self.n_head
+        # H
+        sz_b, len_h, _ = h.size()
+
+        h_kv = self.w_h_kv(h)
+        h_k, h_v = h_kv.chunk(2, -1)
+
+        h_k = h_k.view(sz_b, len_h, n_head, d_head)
+        h_v = h_v.view(sz_b, len_h, n_head, d_head)
+
+        self.h_k = h_k.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head)
+        self.h_v = h_v.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head)
+
+        self.h_state_size = len_h  # the length this cache was built for
+
+        return h_k, h_v
+
+    def reset_state(self):
+        """Drop the cached keys/values of both x and h.
+
+        update_x_state() keeps appending to the x cache until this runs.
+        """
+        self.h_k, self.h_v, self.h_state_size = None, None, 0
+        self.x_k, self.x_v, self.x_state_size = None, None, 0
+
+    def forward(self, x, h, mask_x=None, mask_h=None):
+        """Return (output, attn_x, attn_h) for input x and memory h."""
+        residual = x
+        self.update_h_state(h)
+        x_q, x_k, x_v = self.update_x_state(self.layer_norm(x))
+
+        d_head, n_head = self.d_head, self.n_head
+        sz_b, len_in, _ = x.size()
+
+        # X: attend over the cached x keys/values
+        if mask_x is not None:
+            mask_x = mask_x.repeat(n_head, 1, 1)  # (n*b) x .. x ..
+        output_x, attn_x = self.attention(x_q, self.x_k, self.x_v, mask=mask_x)
+
+        output_x = output_x.view(n_head, sz_b, len_in, d_head)
+        output_x = (output_x.permute(1, 2, 0,
+                                     3).contiguous().view(sz_b, len_in,
+                                                          -1))  # b x l x (n*d)
+        output_x = self.fc_x(output_x)
+
+        # H: attend over the memory keys/values with the same queries
+        if mask_h is not None:
+            mask_h = mask_h.repeat(n_head, 1, 1)
+        output_h, attn_h = self.attention(x_q, self.h_k, self.h_v, mask=mask_h)
+
+        output_h = output_h.view(n_head, sz_b, len_in, d_head)
+        output_h = (output_h.permute(1, 2, 0,
+                                     3).contiguous().view(sz_b, len_in,
+                                                          -1))  # b x l x (n*d)
+        output_h = self.fc_h(output_h)
+
+        output = output_x + output_h
+
+        output = self.dropout(output)
+
+        output = output + residual
+
+        return output, attn_x, attn_h
+
+
+class PNCABlock(nn.Module):
+    """PNCA attention followed by a convolutional feed-forward layer."""
+
+    def __init__(self,
+                 d_model,
+                 d_mem,
+                 n_head,
+                 d_head,
+                 d_inner,
+                 kernel_size,
+                 dropout,
+                 dropout_attn=0.0,
+                 dropout_relu=0.0):
+        super().__init__()
+        self.pnca_attn = MultiHeadPNCAAttention(
+            n_head, d_model, d_mem, d_head, dropout=dropout,
+            dropatt=dropout_attn)
+        self.pos_ffn = PositionwiseConvFeedForward(
+            d_model, d_inner, kernel_size, dropout_inner=dropout_relu,
+            dropout=dropout)
+
+    def forward(self,
+                input,
+                memory,
+                mask=None,
+                pnca_x_attn_mask=None,
+                pnca_h_attn_mask=None):
+        """Return (output, attention over input, attention over memory).
+
+        The two attention masks go to the PNCA attention; `mask` goes to
+        the feed-forward layer and zeroes masked steps after each
+        sub-layer.
+        """
+        def zero_masked(t):
+            if mask is None:
+                return t
+            return t.masked_fill(mask.unsqueeze(-1), 0)
+
+        attended, pnca_attn_x, pnca_attn_h = self.pnca_attn(
+            input, memory, pnca_x_attn_mask, pnca_h_attn_mask)
+        attended = zero_masked(attended)
+        output = zero_masked(self.pos_ffn(attended, mask=mask))
+
+        return output, pnca_attn_x, pnca_attn_h
+
+    def reset_state(self):
+        self.pnca_attn.reset_state()
diff --git a/modelscope/models/audio/tts/models/models/sambert/fsmn.py b/modelscope/models/audio/tts/models/models/sambert/fsmn.py
new file mode 100644
index 00000000..c070ef35
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/fsmn.py
@@ -0,0 +1,126 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+"""
+FSMN Pytorch Version
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class FeedForwardNet(nn.Module):
+    """Two stacked 1-D convolutions applied along the sequence axis.
+
+    Args:
+        d_in: channels of the input features.
+        d_hid: channels produced by the first convolution.
+        d_out: channels produced by the second (bias-free) convolution.
+        kernel_size: pair of kernel widths, one per convolution.
+        dropout: dropout probability applied after the ReLU.
+    """
+
+    # An immutable tuple default avoids sharing one mutable list across calls.
+    def __init__(self, d_in, d_hid, d_out, kernel_size=(1, 1), dropout=0.1):
+        super().__init__()
+        k_first, k_second = kernel_size[0], kernel_size[1]
+
+        # (k - 1) // 2 padding keeps the length unchanged for odd widths.
+        self.w_1 = nn.Conv1d(
+            d_in, d_hid, kernel_size=k_first, padding=(k_first - 1) // 2)
+        self.w_2 = nn.Conv1d(
+            d_hid, d_out, kernel_size=k_second, padding=(k_second - 1) // 2,
+            bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        """Map ``x`` of layout (batch, time, d_in) to (batch, time, d_out)."""
+        # Conv1d convolves over the last axis, so move channels to axis 1.
+        hidden = F.relu(self.w_1(x.transpose(1, 2)))
+        hidden = self.dropout(hidden)
+        output = self.w_2(hidden).transpose(1, 2)
+
+        return output
+
+
+class MemoryBlockV2(nn.Module):
+    """Depthwise 1-D convolution over time plus a residual connection."""
+
+    def __init__(self, d, filter_size, shift, dropout=0.0):
+        super(MemoryBlockV2, self).__init__()
+
+        # Integer split keeps lp + rp == filter_size - 1 for every width; the
+        # former round() call used banker's rounding and lost one frame for
+        # widths 2, 6, 10, ..., which broke the residual addition in forward.
+        left_padding = filter_size // 2
+        right_padding = (filter_size - 1) // 2
+        if shift > 0:  # move ``shift`` frames of padding from right to left
+            left_padding += shift
+            right_padding -= shift
+        self.lp, self.rp = left_padding, right_padding
+
+        self.conv_dw = nn.Conv1d(d, d, filter_size, 1, 0, groups=d, bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, input, mask=None):
+        """Filter (batch, time, d) ``input``; masked frames are zeroed."""
+        if mask is not None:
+            input = input.masked_fill(mask.unsqueeze(-1), 0)
+        x = F.pad(
+            input, (0, 0, self.lp, self.rp, 0, 0), mode='constant', value=0.0)
+        output = self.conv_dw(x.transpose(1, 2)).transpose(1, 2) + input
+        output = self.dropout(output)
+        if mask is not None:
+            output = output.masked_fill(mask.unsqueeze(-1), 0)
+
+        return output
+
+
+class FsmnEncoderV2(nn.Module):
+    """Stack of ``FeedForwardNet`` + ``MemoryBlockV2`` layer pairs.
+
+    ``shift`` may be a single value shared by all layers or a list holding
+    one value per layer.
+    """
+
+    def __init__(self,
+                 filter_size,
+                 fsmn_num_layers,
+                 input_dim,
+                 num_memory_units,
+                 ffn_inner_dim,
+                 dropout=0.0,
+                 shift=0):
+        super(FsmnEncoderV2, self).__init__()
+
+        self.filter_size = filter_size
+        self.fsmn_num_layers = fsmn_num_layers
+        self.num_memory_units = num_memory_units
+        self.ffn_inner_dim = ffn_inner_dim
+        self.dropout = dropout
+        # Broadcast a scalar shift to one entry per layer.
+        self.shift = (
+            shift if isinstance(shift, list) else [shift] * fsmn_num_layers)
+
+        # Only the first feed-forward layer consumes ``input_dim`` channels.
+        layer_inputs = [input_dim] + [num_memory_units] * (fsmn_num_layers - 1)
+        self.ffn_lst = nn.ModuleList([
+            FeedForwardNet(
+                d_in, ffn_inner_dim, num_memory_units, dropout=dropout)
+            for d_in in layer_inputs
+        ])
+        self.memory_block_lst = nn.ModuleList([
+            MemoryBlockV2(num_memory_units, filter_size, self.shift[idx],
+                          dropout) for idx in range(fsmn_num_layers)
+        ])
+
+    def forward(self, input, mask=None):
+        """Encode ``input``, adding a skip connection when widths agree."""
+        hidden = F.dropout(input, self.dropout, self.training)
+        for ffn, memory_block in zip(self.ffn_lst, self.memory_block_lst):
+            memory = memory_block(ffn(hidden), mask)
+            memory = F.dropout(memory, self.dropout, self.training)
+            # The residual is only possible when feature sizes match.
+            if memory.size(-1) == hidden.size(-1):
+                memory += hidden
+            hidden = memory
+
+        return hidden
diff --git a/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py b/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py
new file mode 100644
index 00000000..3837a2e8
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py
@@ -0,0 +1,718 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.models.audio.tts.models.utils import get_mask_from_lengths
+from .adaptors import (LengthRegulator, VarFsmnRnnNARPredictor,
+                       VarRnnARPredictor)
+from .base import FFTBlock, PNCABlock, Prenet
+from .fsmn import FsmnEncoderV2
+from .positions import DurSinusoidalPositionEncoder, SinusoidalPositionEncoder
+
+
+class SelfAttentionEncoder(nn.Module):
+    """Stack of ``FFTBlock`` layers with sinusoidal positions and LayerNorm."""
+
+    def __init__(self, n_layer, d_in, d_model, n_head, d_head, d_inner,
+                 dropout, dropout_att, dropout_relu, position_encoder):
+        super(SelfAttentionEncoder, self).__init__()
+
+        self.d_in = d_in
+        self.d_model = d_model
+        self.dropout = dropout
+        d_in_lst = [d_in] + [d_model] * (n_layer - 1)
+        self.fft = nn.ModuleList([
+            FFTBlock(d, d_model, n_head, d_head, d_inner, (3, 1), dropout,
+                     dropout_att, dropout_relu) for d in d_in_lst
+        ])
+        self.ln = nn.LayerNorm(d_model, eps=1e-6)
+        self.position_enc = position_encoder
+
+    def forward(self, input, mask=None, return_attns=False):
+        """Encode ``input``; returns ``(enc_output, enc_slf_attn_list)``.
+
+        The attention list stays empty unless ``return_attns`` is true.
+        """
+        if not isinstance(self.position_enc, SinusoidalPositionEncoder):
+            raise NotImplementedError('modelscope error: position_enc invalid')
+
+        # Scale out of place: ``input *= ...`` overwrote the caller's tensor
+        # and raises on a leaf tensor that requires grad.
+        enc_output = self.position_enc(input * self.d_model**0.5)
+        enc_output = F.dropout(enc_output, self.dropout, self.training)
+
+        slf_attn_mask = None
+        if mask is not None:
+            slf_attn_mask = mask.unsqueeze(1).expand(-1, input.size(1), -1)
+
+        enc_slf_attn_list = []
+        for layer in self.fft:
+            enc_output, enc_slf_attn = layer(
+                enc_output, mask=mask, slf_attn_mask=slf_attn_mask)
+            if return_attns:
+                enc_slf_attn_list.append(enc_slf_attn)
+
+        return self.ln(enc_output), enc_slf_attn_list
+
+
+class HybridAttentionDecoder(nn.Module):
+    """Prenet + input projection + PNCABlock stack + output projection."""
+    def __init__(self, d_in, prenet_units, n_layer, d_model, d_mem, n_head,
+                 d_head, d_inner, dropout, dropout_att, dropout_relu, d_out):
+        super(HybridAttentionDecoder, self).__init__()
+
+        self.d_model = d_model
+        self.dropout = dropout
+        self.prenet = Prenet(d_in, prenet_units, d_model)
+        self.dec_in_proj = nn.Linear(d_model + d_mem, d_model)
+        self.pnca = nn.ModuleList([
+            PNCABlock(d_model, d_mem, n_head, d_head, d_inner, (1, 1), dropout,
+                      dropout_att, dropout_relu) for _ in range(n_layer)
+        ])
+        self.ln = nn.LayerNorm(d_model, eps=1e-6)
+        self.dec_out_proj = nn.Linear(d_model, d_out)
+
+    def reset_state(self):
+        for layer in self.pnca:
+            layer.reset_state()
+
+    def get_pnca_attn_mask(self,
+                           device,
+                           max_len,
+                           x_band_width,
+                           h_band_width,
+                           mask=None):
+        if mask is not None:
+            pnca_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)
+        else:
+            pnca_attn_mask = None
+        # Per-position band limits, laid out along the last axis.
+        range_ = torch.arange(max_len).to(device)
+        x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :]
+        x_end = (range_ + 1)[None, None, :]
+        h_start = range_[None, None, :]
+        h_end = torch.clamp_max(range_ + h_band_width + 1,
+                                max_len + 1)[None, None, :]
+        # Invert the in-band test: True marks a pair outside the band.
+        pnca_x_attn_mask = ~((x_start <= range_[None, :, None])
+                             & (x_end > range_[None, :, None])).transpose(1, 2)  # yapf:disable
+        pnca_h_attn_mask = ~((h_start <= range_[None, :, None])
+                             & (h_end > range_[None, :, None])).transpose(1, 2)  # yapf:disable
+        # OR in the expanded mask, then reset the rows of masked positions.
+        if pnca_attn_mask is not None:
+            pnca_x_attn_mask = (pnca_x_attn_mask | pnca_attn_mask)
+            pnca_h_attn_mask = (pnca_h_attn_mask | pnca_attn_mask)
+            pnca_x_attn_mask = pnca_x_attn_mask.masked_fill(
+                pnca_attn_mask.transpose(1, 2), False)
+            pnca_h_attn_mask = pnca_h_attn_mask.masked_fill(
+                pnca_attn_mask.transpose(1, 2), False)
+
+        return pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask
+
+    # reset_state() must be called before forward().
+    def forward(self,
+                input,
+                memory,
+                x_band_width,
+                h_band_width,
+                mask=None,
+                return_attns=False):
+        input = self.prenet(input)
+        input = torch.cat([memory, input], dim=-1)
+        input = self.dec_in_proj(input)
+
+        if mask is not None:
+            input = input.masked_fill(mask.unsqueeze(-1), 0)
+
+        input *= self.d_model**0.5
+        input = F.dropout(input, p=self.dropout, training=self.training)
+
+        max_len = input.size(1)
+        pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
+            input.device, max_len, x_band_width, h_band_width, mask)
+        # Attention values are collected only when return_attns is true.
+        dec_pnca_attn_x_list = []
+        dec_pnca_attn_h_list = []
+        dec_output = input
+        for id, layer in enumerate(self.pnca):
+            dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
+                dec_output,
+                memory,
+                mask=mask,
+                pnca_x_attn_mask=pnca_x_attn_mask,
+                pnca_h_attn_mask=pnca_h_attn_mask)
+            if return_attns:
+                dec_pnca_attn_x_list += [dec_pnca_attn_x]
+                dec_pnca_attn_h_list += [dec_pnca_attn_h]
+
+        dec_output = self.ln(dec_output)
+        dec_output = self.dec_out_proj(dec_output)
+
+        return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
+
+    # reset_state() must be called before infer() when step == 0.
+    def infer(self,
+              step,
+              input,
+              memory,
+              x_band_width,
+              h_band_width,
+              mask=None,
+              return_attns=False):
+        max_len = memory.size(1)
+        # Only the memory frame at ``step`` is concatenated with the input.
+        input = self.prenet(input)
+        input = torch.cat([memory[:, step:step + 1, :], input], dim=-1)
+        input = self.dec_in_proj(input)
+
+        input *= self.d_model**0.5
+        input = F.dropout(input, p=self.dropout, training=self.training)
+        # Masks cover the full memory length and are sliced per step below.
+        pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
+            input.device, max_len, x_band_width, h_band_width, mask)
+
+        dec_pnca_attn_x_list = []
+        dec_pnca_attn_h_list = []
+        dec_output = input
+        for id, layer in enumerate(self.pnca):
+            if mask is not None:
+                mask_step = mask[:, step:step + 1]
+            else:
+                mask_step = None
+            dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
+                dec_output,
+                memory,
+                mask=mask_step,
+                pnca_x_attn_mask=pnca_x_attn_mask[:,
+                                                  step:step + 1, :(step + 1)],
+                pnca_h_attn_mask=pnca_h_attn_mask[:, step:step + 1, :])
+            if return_attns:
+                dec_pnca_attn_x_list += [dec_pnca_attn_x]
+                dec_pnca_attn_h_list += [dec_pnca_attn_h]
+
+        dec_output = self.ln(dec_output)
+        dec_output = self.dec_out_proj(dec_output)
+
+        return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
+
+
+class TextFftEncoder(nn.Module):
+    """Embeds four linguistic id streams and self-attention-encodes their sum.
+
+    ``ling_unit_size`` supplies the vocabulary sizes under the keys ``sy``,
+    ``tone``, ``syllable_flag`` and ``word_segment``; every other setting is
+    read from ``config['am']``.
+    """
+
+    def __init__(self, config, ling_unit_size):
+        super(TextFftEncoder, self).__init__()
+
+        am_cfg = config['am']
+        d_emb = am_cfg['embedding_dim']
+        nb_heads = am_cfg['encoder_num_heads']
+        d_model = am_cfg['encoder_num_units']
+
+        self.d_model = d_model
+
+        # One lookup table per linguistic stream, all of width d_emb.
+        self.sy_emb = nn.Embedding(ling_unit_size['sy'], d_emb)
+        self.tone_emb = nn.Embedding(ling_unit_size['tone'], d_emb)
+        self.syllable_flag_emb = nn.Embedding(
+            ling_unit_size['syllable_flag'], d_emb)
+        self.ws_emb = nn.Embedding(ling_unit_size['word_segment'], d_emb)
+
+        position_enc = SinusoidalPositionEncoder(am_cfg['max_len'], d_emb)
+
+        self.ling_enc = SelfAttentionEncoder(
+            am_cfg['encoder_num_layers'],
+            d_emb,
+            d_model,
+            nb_heads,
+            d_model // nb_heads,  # width of a single attention head
+            am_cfg['encoder_ffn_inner_dim'],
+            am_cfg['encoder_dropout'],
+            am_cfg['encoder_attention_dropout'],
+            am_cfg['encoder_relu_dropout'],
+            position_enc)
+
+        self.ling_proj = nn.Linear(
+            d_model, am_cfg['encoder_projection_units'], bias=False)
+
+    def forward(self, inputs_ling, masks=None, return_attns=False):
+        """Encode linguistic ids; returns ``(enc_output, enc_slf_attn_list)``.
+
+        ``inputs_ling[:, :, k]`` for k = 0..3 holds the sy, tone,
+        syllable_flag and word_segment ids respectively.
+        """
+        tables = (self.sy_emb, self.tone_emb, self.syllable_flag_emb,
+                  self.ws_emb)
+        sy_embedding, tone_embedding, syllable_flag_embedding, ws_embedding = [
+            table(inputs_ling[:, :, k]) for k, table in enumerate(tables)
+        ]
+        ling_embedding = (
+            sy_embedding + tone_embedding + syllable_flag_embedding
+            + ws_embedding)
+
+        enc_output, enc_slf_attn_list = self.ling_enc(
+            ling_embedding, masks, return_attns)
+
+        return self.ling_proj(enc_output), enc_slf_attn_list
+
+
+class VarianceAdaptor(nn.Module):
+    """Predicts pitch, energy and duration, then length-regulates the inputs.
+
+    All hyper-parameters are read from ``config['am']``. Ground-truth targets
+    passed to ``forward`` replace the matching predictions when the pitch and
+    energy embeddings are computed and when the sequences are expanded.
+    """
+
+    def __init__(self, config):
+        super(VarianceAdaptor, self).__init__()
+
+        am_cfg = config['am']
+        d_proj = am_cfg['encoder_projection_units']
+        outputs_per_step = am_cfg['outputs_per_step']
+
+        # The predictors see text, emotion and speaker features concatenated.
+        input_dim = d_proj + am_cfg['emotion_units'] + am_cfg['speaker_units']
+        # Pitch and energy predictors share one hyper-parameter set.
+        var_predictor_args = (
+            input_dim,
+            am_cfg['predictor_filter_size'],
+            am_cfg['predictor_fsmn_num_layers'],
+            am_cfg['predictor_num_memory_units'],
+            am_cfg['predictor_ffn_inner_dim'],
+            am_cfg['predictor_dropout'],
+            am_cfg['predictor_shift'],
+            am_cfg['predictor_lstm_units'],
+        )
+
+        self.pitch_predictor = VarFsmnRnnNARPredictor(*var_predictor_args)
+        self.energy_predictor = VarFsmnRnnNARPredictor(*var_predictor_args)
+        self.duration_predictor = VarRnnARPredictor(
+            input_dim,
+            am_cfg['dur_pred_prenet_units'],
+            am_cfg['dur_pred_lstm_units'])
+
+        self.length_regulator = LengthRegulator(outputs_per_step)
+        self.dur_position_encoder = DurSinusoidalPositionEncoder(
+            d_proj, outputs_per_step)
+
+        # Lift each scalar track from 1 channel to d_proj channels; kernel 9
+        # with padding 4 leaves the sequence length unchanged.
+        self.pitch_emb = nn.Conv1d(1, d_proj, kernel_size=9, padding=4)
+        self.energy_emb = nn.Conv1d(1, d_proj, kernel_size=9, padding=4)
+
+    def forward(self,
+                inputs_text_embedding,
+                inputs_emo_embedding,
+                inputs_spk_embedding,
+                masks=None,
+                output_masks=None,
+                duration_targets=None,
+                pitch_targets=None,
+                energy_targets=None):
+        """Predict the variances and expand the inputs by duration.
+
+        Args:
+            inputs_text_embedding: text encoder output.
+            inputs_emo_embedding: emotion embedding, concatenated to the text
+                features along the last axis.
+            inputs_spk_embedding: speaker embedding, concatenated likewise.
+            masks: forwarded to the three predictors.
+            output_masks: forwarded to the length regulator and to the
+                duration position encoder.
+            duration_targets: when given, the duration predictor is teacher
+                forced with them and they drive the length regulation.
+            pitch_targets: when given, embedded instead of the predictions.
+            energy_targets: when given, embedded instead of the predictions.
+
+        Returns:
+            Tuple ``(LR_text_outputs, LR_emo_outputs, LR_spk_outputs,
+            LR_length_rounded, log_duration_predictions, pitch_predictions,
+            energy_predictions)``.
+        """
+        # The concatenation order (text, speaker, emotion) is kept as is.
+        variance_predictor_inputs = torch.cat([
+            inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding
+        ], dim=-1)  # yapf:disable
+
+        pitch_predictions = self.pitch_predictor(variance_predictor_inputs,
+                                                 masks)
+        energy_predictions = self.energy_predictor(variance_predictor_inputs,
+                                                   masks)
+
+        # Ground truth takes precedence over the prediction when available.
+        pitch_source = (
+            pitch_predictions if pitch_targets is None else pitch_targets)
+        energy_source = (
+            energy_predictions if energy_targets is None else energy_targets)
+        pitch_embeddings = self.pitch_emb(
+            pitch_source.unsqueeze(1)).transpose(1, 2)
+        energy_embeddings = self.energy_emb(
+            energy_source.unsqueeze(1)).transpose(1, 2)
+
+        inputs_text_embedding_aug = (
+            inputs_text_embedding + pitch_embeddings + energy_embeddings)
+        duration_predictor_cond = torch.cat(
+            [inputs_text_embedding_aug, inputs_spk_embedding,
+             inputs_emo_embedding],
+            dim=-1)
+
+        if duration_targets is not None:
+            # Teacher forcing: feed the targets shifted right by one step
+            # behind a zero "go" frame, in the log(1 + d) domain.
+            batch_size = inputs_text_embedding.size(0)
+            go_frame = torch.zeros(batch_size, 1).to(
+                inputs_text_embedding.device)
+            shifted_targets = torch.cat(
+                [go_frame, duration_targets[:, :-1].float()], dim=-1)
+            log_duration_predictions, _ = self.duration_predictor(
+                torch.log(shifted_targets + 1).unsqueeze(-1),
+                duration_predictor_cond,
+                masks=masks)
+            durations = duration_targets
+        else:
+            log_duration_predictions = self.duration_predictor.infer(
+                duration_predictor_cond, masks=masks)
+            # Invert the log(1 + d) transform used for the targets above.
+            durations = torch.exp(log_duration_predictions) - 1
+
+        # Text, emotion and speaker streams are expanded with one duration set.
+        LR_text_outputs, LR_length_rounded = self.length_regulator(
+            inputs_text_embedding_aug, durations, masks=output_masks)
+        LR_position_embeddings = self.dur_position_encoder(
+            durations, masks=output_masks)
+        LR_emo_outputs, _ = self.length_regulator(
+            inputs_emo_embedding, durations, masks=output_masks)
+        LR_spk_outputs, _ = self.length_regulator(
+            inputs_spk_embedding, durations, masks=output_masks)
+
+        LR_text_outputs = LR_text_outputs + LR_position_embeddings
+
+        return (LR_text_outputs, LR_emo_outputs, LR_spk_outputs,
+                LR_length_rounded, log_duration_predictions, pitch_predictions,
+                energy_predictions)
+
+
+class MelPNCADecoder(nn.Module):
+    """Mel decoder wrapping ``HybridAttentionDecoder``.
+
+    With ``target`` the decoder is teacher forced in a single call; without
+    it, decoding runs one step per memory position and feeds the last
+    ``d_mel`` values of each step's output back as the next input.
+    """
+
+    def __init__(self, config):
+        super(MelPNCADecoder, self).__init__()
+
+        prenet_units = config['am']['decoder_prenet_units']
+        nb_layers = config['am']['decoder_num_layers']
+        nb_heads = config['am']['decoder_num_heads']
+        d_model = config['am']['decoder_num_units']
+        d_head = d_model // nb_heads
+        d_inner = config['am']['decoder_ffn_inner_dim']
+        dropout = config['am']['decoder_dropout']
+        dropout_attn = config['am']['decoder_attention_dropout']
+        dropout_relu = config['am']['decoder_relu_dropout']
+        outputs_per_step = config['am']['outputs_per_step']
+
+        # Memory width: r stacked text frames plus emotion and speaker units.
+        d_mem = config['am'][
+            'encoder_projection_units'] * outputs_per_step + config['am'][
+                'emotion_units'] + config['am']['speaker_units']
+        d_mel = config['am']['num_mels']
+
+        self.d_mel = d_mel
+        self.r = outputs_per_step
+        self.nb_layers = nb_layers
+
+        self.mel_dec = HybridAttentionDecoder(d_mel, prenet_units, nb_layers,
+                                              d_model, d_mem, nb_heads, d_head,
+                                              d_inner, dropout, dropout_attn,
+                                              dropout_relu,
+                                              d_mel * outputs_per_step)
+
+    def forward(self,
+                memory,
+                x_band_width,
+                h_band_width,
+                target=None,
+                mask=None,
+                return_attns=False):
+        """Decode mel frames from ``memory``.
+
+        Returns ``(dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list)``.
+        Both lists are empty unless ``return_attns`` is true.
+        """
+        batch_size = memory.size(0)
+        go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device)
+
+        self.mel_dec.reset_state()
+        if target is not None:
+            # Every r-th target frame, shifted right behind the go frame.
+            input = target[:, self.r - 1::self.r, :]
+            input = torch.cat([go_frame, input], dim=1)[:, :-1, :]
+            return self.mel_dec(
+                input, memory, x_band_width, h_band_width, mask=mask,
+                return_attns=return_attns)
+
+        dec_output = []
+        dec_pnca_attn_x_list = [[] for _ in range(self.nb_layers)]
+        dec_pnca_attn_h_list = [[] for _ in range(self.nb_layers)]
+        input = go_frame
+        for step in range(memory.size(1)):
+            dec_output_step, attn_x_step, attn_h_step = self.mel_dec.infer(
+                step, input, memory, x_band_width, h_band_width, mask=mask,
+                return_attns=return_attns)
+            input = dec_output_step[:, :, -self.d_mel:]
+            dec_output.append(dec_output_step)
+
+            attn_pairs = zip(attn_x_step, attn_h_step)
+            for layer_id, (pnca_x_attn, pnca_h_attn) in enumerate(attn_pairs):
+                # Right-pad the x attention with zeros to the memory length.
+                left = memory.size(1) - pnca_x_attn.size(-1)
+                if left > 0:
+                    padding = torch.zeros(
+                        (pnca_x_attn.size(0), 1, left)).to(pnca_x_attn)
+                    pnca_x_attn = torch.cat([pnca_x_attn, padding], dim=-1)
+                dec_pnca_attn_x_list[layer_id].append(pnca_x_attn)
+                dec_pnca_attn_h_list[layer_id].append(pnca_h_attn)
+
+        # torch.cat rejects an empty list, so layers that collected nothing
+        # (return_attns false) are dropped instead of being concatenated.
+        x_attns = [torch.cat(a, dim=1) for a in dec_pnca_attn_x_list if a]
+        h_attns = [torch.cat(a, dim=1) for a in dec_pnca_attn_h_list if a]
+        return torch.cat(dec_output, dim=1), x_attns, h_attns
+
+
+class PostNet(nn.Module):
+    """FSMN encoder, single-layer LSTM and linear projection to mel bins."""
+
+    def __init__(self, config):
+        super(PostNet, self).__init__()
+
+        am_cfg = config['am']
+        self.filter_size = am_cfg['postnet_filter_size']
+        self.fsmn_num_layers = am_cfg['postnet_fsmn_num_layers']
+        self.num_memory_units = am_cfg['postnet_num_memory_units']
+        self.ffn_inner_dim = am_cfg['postnet_ffn_inner_dim']
+        self.dropout = am_cfg['postnet_dropout']
+        self.shift = am_cfg['postnet_shift']
+        self.lstm_units = am_cfg['postnet_lstm_units']
+        self.num_mels = am_cfg['num_mels']
+
+        self.fsmn = FsmnEncoderV2(
+            self.filter_size, self.fsmn_num_layers, self.num_mels,
+            self.num_memory_units, self.ffn_inner_dim, self.dropout,
+            self.shift)
+        self.lstm = nn.LSTM(
+            self.num_memory_units, self.lstm_units, num_layers=1,
+            batch_first=True)
+        self.fc = nn.Linear(self.lstm_units, self.num_mels)
+
+    def forward(self, x, mask=None):
+        """Return the mel residual predicted for ``x``."""
+        # The input could also be a packed variable-length sequence; that is
+        # omitted for simplicity given the mask and the uni-directional LSTM.
+        postnet_lstm_output, _ = self.lstm(self.fsmn(x, mask))
+
+        return self.fc(postnet_lstm_output)
+
+
+def mel_recon_loss_fn(output_lengths,
+                      mel_targets,
+                      dec_outputs,
+                      postnet_outputs=None):
+    """Masked mean absolute error of the decoder and postnet mel outputs.
+
+    Returns ``(mel_loss_, mel_loss)``: the decoder loss and the postnet loss.
+    The postnet loss is the float ``0.0`` when ``postnet_outputs`` is None.
+    """
+    mae_loss = nn.L1Loss(reduction='none')
+
+    # The helper's mask is inverted before use as a per-frame weight.
+    output_masks = ~get_mask_from_lengths(
+        output_lengths, max_len=mel_targets.size(1))
+    frame_weights = output_masks.unsqueeze(-1)
+    normalizer = output_masks.sum() * mel_targets.size(-1)
+
+    def masked_mae(outputs):
+        errors = mae_loss(mel_targets, outputs) * frame_weights
+        return torch.sum(errors) / normalizer
+
+    mel_loss_ = masked_mae(dec_outputs)
+    mel_loss = 0.0 if postnet_outputs is None else masked_mae(postnet_outputs)
+    return mel_loss_, mel_loss
+
+
+def prosody_recon_loss_fn(input_lengths, duration_targets, pitch_targets,
+                          energy_targets, log_duration_predictions,
+                          pitch_predictions, energy_predictions):
+    """Masked mean absolute errors for duration (log domain), pitch, energy.
+
+    Returns ``(dur_loss, pitch_loss, energy_loss)``.
+    """
+    mae_loss = nn.L1Loss(reduction='none')
+
+    input_masks = ~get_mask_from_lengths(
+        input_lengths, max_len=duration_targets.size(1))
+    valid_inputs = input_masks.sum()
+
+    def masked_mean(targets, predictions):
+        return torch.sum(
+            mae_loss(targets, predictions) * input_masks) / valid_inputs
+
+    log_duration_targets = torch.log(duration_targets.float() + 1)
+    dur_loss = masked_mean(log_duration_targets, log_duration_predictions)
+    pitch_loss = masked_mean(pitch_targets, pitch_predictions)
+    energy_loss = masked_mean(energy_targets, energy_predictions)
+    return dur_loss, pitch_loss, energy_loss
+
+
+class KanTtsSAMBERT(nn.Module):
+    """Text encoder + variance adaptor + PNCA mel decoder + postnet."""
+    def __init__(self, config, ling_unit_size):
+        super(KanTtsSAMBERT, self).__init__()
+
+        self.text_encoder = TextFftEncoder(config, ling_unit_size)
+        self.spk_tokenizer = nn.Embedding(ling_unit_size['speaker'],
+                                          config['am']['speaker_units'])
+        self.emo_tokenizer = nn.Embedding(ling_unit_size['emotion'],
+                                          config['am']['emotion_units'])
+        self.variance_adaptor = VarianceAdaptor(config)
+        self.mel_decoder = MelPNCADecoder(config)
+        self.mel_postnet = PostNet(config)
+
+    def get_lfr_mask_from_lengths(self, lengths, max_len):
+        batch_size = lengths.size(0)
+        # Per item this computes ceil(length / r), with r = outputs_per_step.
+        padded_lr_lengths = torch.zeros_like(lengths)
+        for i in range(batch_size):
+            len_item = int(lengths[i].item())
+            padding = self.mel_decoder.r - len_item % self.mel_decoder.r
+            if (padding < self.mel_decoder.r):
+                padded_lr_lengths[i] = (len_item
+                                        + padding) // self.mel_decoder.r
+            else:
+                padded_lr_lengths[i] = len_item // self.mel_decoder.r
+
+        return get_mask_from_lengths(
+            padded_lr_lengths, max_len=max_len // self.mel_decoder.r)
+
+    def forward(self,
+                inputs_ling,
+                inputs_emotion,
+                inputs_speaker,
+                input_lengths,
+                output_lengths=None,
+                mel_targets=None,
+                duration_targets=None,
+                pitch_targets=None,
+                energy_targets=None):
+
+        batch_size = inputs_ling.size(0)
+
+        input_masks = get_mask_from_lengths(
+            input_lengths, max_len=inputs_ling.size(1))
+
+        text_hid, enc_sla_attn_lst = self.text_encoder(
+            inputs_ling, input_masks, return_attns=True)
+
+        emo_hid = self.emo_tokenizer(inputs_emotion)
+        spk_hid = self.spk_tokenizer(inputs_speaker)
+        # NOTE(review): output_lengths without mel_targets fails on .size below.
+        if output_lengths is not None:
+            output_masks = get_mask_from_lengths(
+                output_lengths, max_len=mel_targets.size(1))
+        else:
+            output_masks = None
+
+        (LR_text_outputs, LR_emo_outputs, LR_spk_outputs, LR_length_rounded,
+         log_duration_predictions, pitch_predictions,
+         energy_predictions) = self.variance_adaptor(
+             text_hid,
+             emo_hid,
+             spk_hid,
+             masks=input_masks,
+             output_masks=output_masks,
+             duration_targets=duration_targets,
+             pitch_targets=pitch_targets,
+             energy_targets=energy_targets)
+        # Without output_lengths the output mask comes from LR_length_rounded.
+        if output_lengths is not None:
+            lfr_masks = self.get_lfr_mask_from_lengths(
+                output_lengths, max_len=LR_text_outputs.size(1))
+        else:
+            output_masks = get_mask_from_lengths(
+                LR_length_rounded, max_len=LR_text_outputs.size(1))
+            lfr_masks = None
+        # LFR with the factor of outputs_per_step: every r frames are folded
+        # into one; emotion/speaker keep only the first frame of each group.
+        LFR_text_inputs = LR_text_outputs.contiguous().view(
+            batch_size, -1, self.mel_decoder.r * text_hid.shape[-1])
+        LFR_emo_inputs = LR_emo_outputs.contiguous().view(
+            batch_size, -1,
+            self.mel_decoder.r * emo_hid.shape[-1])[:, :, :emo_hid.shape[-1]]
+        LFR_spk_inputs = LR_spk_outputs.contiguous().view(
+            batch_size, -1,
+            self.mel_decoder.r * spk_hid.shape[-1])[:, :, :spk_hid.shape[-1]]
+
+        memory = torch.cat([LFR_text_inputs, LFR_spk_inputs, LFR_emo_inputs],
+                           dim=-1)
+        # Band width: the largest duration in decoder steps, rounded.
+        if duration_targets is not None:
+            x_band_width = int(
+                duration_targets.float().masked_fill(input_masks, 0).max()
+                / self.mel_decoder.r + 0.5)
+            h_band_width = x_band_width
+        else:
+            x_band_width = int((torch.exp(log_duration_predictions) - 1).max()
+                               / self.mel_decoder.r + 0.5)
+            h_band_width = x_band_width
+
+        dec_outputs, pnca_x_attn_lst, pnca_h_attn_lst = self.mel_decoder(
+            memory,
+            x_band_width,
+            h_band_width,
+            target=mel_targets,
+            mask=lfr_masks,
+            return_attns=True)
+
+        # De-LFR: unfold each decoder step back into frames of d_mel values.
+        dec_outputs = dec_outputs.contiguous().view(batch_size, -1,
+                                                    self.mel_decoder.d_mel)
+
+        if output_masks is not None:
+            dec_outputs = dec_outputs.masked_fill(
+                output_masks.unsqueeze(-1), 0)
+        # The postnet output is a residual added to the decoder output.
+        postnet_outputs = self.mel_postnet(dec_outputs,
+                                           output_masks) + dec_outputs
+        if output_masks is not None:
+            postnet_outputs = postnet_outputs.masked_fill(
+                output_masks.unsqueeze(-1), 0)
+
+        res = {
+            'x_band_width': x_band_width,
+            'h_band_width': h_band_width,
+            'enc_slf_attn_lst': enc_sla_attn_lst,
+            'pnca_x_attn_lst': pnca_x_attn_lst,
+            'pnca_h_attn_lst': pnca_h_attn_lst,
+            'dec_outputs': dec_outputs,
+            'postnet_outputs': postnet_outputs,
+            'LR_length_rounded': LR_length_rounded,
+            'log_duration_predictions': log_duration_predictions,
+            'pitch_predictions': pitch_predictions,
+            'energy_predictions': energy_predictions
+        }
+
+        res['LR_text_outputs'] = LR_text_outputs
+        res['LR_emo_outputs'] = LR_emo_outputs
+        res['LR_spk_outputs'] = LR_spk_outputs
+
+        return res
diff --git a/modelscope/models/audio/tts/models/models/sambert/positions.py b/modelscope/models/audio/tts/models/models/sambert/positions.py
new file mode 100644
index 00000000..9d1e375d
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/positions.py
@@ -0,0 +1,101 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class SinusoidalPositionEncoder(nn.Module):  # adds a sin/cos table to input
+
+    def __init__(self, max_len, depth):
+        super(SinusoidalPositionEncoder, self).__init__()
+
+        self.max_len = max_len  # current table length; forward may grow it
+        self.depth = depth  # width of each table row
+        self.position_enc = nn.Parameter(  # (1, max_len, depth), frozen
+            self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0),
+            requires_grad=False)
+
+    def forward(self, input):  # input must be 3-D (unpacked below)
+        bz_in, len_in, _ = input.size()
+        if len_in > self.max_len:  # table too short: rebuild it longer
+            self.max_len = len_in
+            self.position_enc.data = self.get_sinusoid_encoding_table(
+                self.max_len, self.depth).unsqueeze(0).to(input.device)
+
+        output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1)
+
+        return output
+
+    @staticmethod
+    def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
+        """Return a (n_position, d_hid) FloatTensor of sin/cos encodings."""
+
+        def cal_angle(position, hid_idx):  # NOTE: d_hid == 2 divides by 0
+            return position / np.power(10000, hid_idx / float(d_hid / 2 - 1))
+
+        def get_posi_angle_vec(position):  # d_hid // 2 angles per row
+            return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)]
+
+        scaled_time_table = np.array(  # positions are 1-based (pos_i + 1)
+            [get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)])
+
+        sinusoid_table = np.zeros((n_position, d_hid))  # sin | cos halves
+        sinusoid_table[:, :d_hid // 2] = np.sin(scaled_time_table)
+        sinusoid_table[:, d_hid // 2:] = np.cos(scaled_time_table)
+
+        if padding_idx is not None:
+            # zero the table row(s) selected by padding_idx
+            sinusoid_table[padding_idx] = 0.0
+
+        return torch.FloatTensor(sinusoid_table)
+
+
+class DurSinusoidalPositionEncoder(nn.Module):  # sin/cos of in-token index
+
+    def __init__(self, depth, outputs_per_step):
+        super(DurSinusoidalPositionEncoder, self).__init__()
+
+        self.depth = depth  # width of the returned embedding
+        self.outputs_per_step = outputs_per_step  # pad length to a multiple
+
+        inv_timescales = [  # NOTE: forward() divides by these values
+            np.power(10000, 2 * (hid_idx // 2) / depth)  # same for 2k, 2k+1
+            for hid_idx in range(depth)
+        ]
+        self.inv_timescales = nn.Parameter(  # shape (depth,), frozen
+            torch.FloatTensor(inv_timescales), requires_grad=False)
+
+    def forward(self, durations, masks=None):  # durations is 2-D: (B, T)
+        reps = (durations + 0.5).long()  # round half up (for values >= 0)
+        output_lens = reps.sum(dim=1)  # total frames per batch row
+        max_len = output_lens.max()
+        reps_cumsum = torch.cumsum(  # (B, 1, T + 1), leading zero column
+            F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :]
+        range_ = torch.arange(max_len).to(durations.device)[None, :, None]
+        mult = ((reps_cumsum[:, :, :-1] <= range_)  # frame lies in token t
+                & (reps_cumsum[:, :, 1:] > range_))  # yapf:disable
+        mult = mult.float()
+        offsets = torch.matmul(mult,  # first frame of the covering token
+                               reps_cumsum[:,
+                                           0, :-1].unsqueeze(-1)).squeeze(-1)
+        dur_pos = range_[:, :, 0] - offsets + 1  # 1-based index in token
+
+        if masks is not None:
+            assert masks.size(1) == dur_pos.size(1)
+            dur_pos = dur_pos.masked_fill(masks, 0.0)  # zero masked frames
+
+        seq_len = dur_pos.size(1)
+        padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step
+        if (padding < self.outputs_per_step):  # equal means no pad needed
+            dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0)
+
+        position_embedding = dur_pos[:, :, None] / self.inv_timescales[None,
+                                                                       None, :]
+        position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :,
+                                                                      0::2])
+        position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :,
+                                                                      1::2])
+
+        return position_embedding  # (B, len, depth); even=sin, odd=cos
diff --git a/modelscope/models/audio/tts/models/position.py b/modelscope/models/audio/tts/models/position.py
deleted file mode 100755
index bca658dd..00000000
--- a/modelscope/models/audio/tts/models/position.py
+++ /dev/null
@@ -1,174 +0,0 @@
-"""Define position encoder classes."""
-
-import abc
-import math
-
-import tensorflow as tf
-
-from .reducer import SumReducer
-
-
-class PositionEncoder(tf.keras.layers.Layer):
-    """Base class for position encoders."""
-
-    def __init__(self, reducer=None, **kwargs):
-        """Initializes the position encoder.
-        Args:
-          reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
-            encodings. Defaults to :class:`opennmt.layers.SumReducer`.
-          **kwargs: Additional layer keyword arguments.
-        """
-        super(PositionEncoder, self).__init__(**kwargs)
-        if reducer is None:
-            reducer = SumReducer(dtype=kwargs.get('dtype'))
-        self.reducer = reducer
-
-    def call(self, inputs, position=None):  # pylint: disable=arguments-differ
-        """Add position encodings to :obj:`inputs`.
-        Args:
-          inputs: The inputs to encode.
-          position: The single position to encode, to use when this layer is called
-            step by step.
-        Returns:
-          A ``tf.Tensor`` whose shape depends on the configured ``reducer``.
-        """
-        batch_size = tf.shape(inputs)[0]
-        timesteps = tf.shape(inputs)[1]
-        input_dim = inputs.shape[-1].value
-        positions = tf.range(timesteps) + 1 if position is None else [position]
-        position_encoding = self._encode([positions], input_dim)
-        position_encoding = tf.tile(position_encoding, [batch_size, 1, 1])
-        return self.reducer([inputs, position_encoding])
-
-    @abc.abstractmethod
-    def _encode(self, positions, depth):
-        """Creates position encodings.
-        Args:
-          positions: The positions to encode of shape :math:`[B, ...]`.
-          depth: The encoding depth :math:`D`.
-        Returns:
-          A ``tf.Tensor`` of shape :math:`[B, ..., D]`.
-        """
-        raise NotImplementedError()
-
-
-class PositionEmbedder(PositionEncoder):
-    """Encodes position with a lookup table."""
-
-    def __init__(self, maximum_position=128, reducer=None, **kwargs):
-        """Initializes the position encoder.
-        Args:
-          maximum_position: The maximum position to embed. Positions greater
-            than this value will be set to :obj:`maximum_position`.
-          reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
-            encodings. Defaults to :class:`opennmt.layers.SumReducer`.
-          **kwargs: Additional layer keyword arguments.
-        """
-        super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs)
-        self.maximum_position = maximum_position
-        self.embedding = None
-
-    def build(self, input_shape):
-        shape = [self.maximum_position + 1, input_shape[-1]]
-        self.embedding = self.add_weight('position_embedding', shape)
-        super(PositionEmbedder, self).build(input_shape)
-
-    def _encode(self, positions, depth):
-        positions = tf.minimum(positions, self.maximum_position)
-        return tf.nn.embedding_lookup(self.embedding, positions)
-
-
-class SinusoidalPositionEncoder(PositionEncoder):
-    """Encodes positions with sine waves as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def _encode(self, positions, depth):
-        if depth % 2 != 0:
-            raise ValueError(
-                'SinusoidalPositionEncoder expects the depth to be divisble '
-                'by 2 but got %d' % depth)
-
-        batch_size = tf.shape(positions)[0]
-        positions = tf.cast(positions, tf.float32)
-
-        log_timescale_increment = math.log(10000) / (depth / 2 - 1)
-        inv_timescales = tf.exp(
-            tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment)
-        inv_timescales = tf.reshape(
-            tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2])
-        scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims(
-            inv_timescales, 1)
-        encoding = tf.concat(
-            [tf.sin(scaled_time), tf.cos(scaled_time)], axis=2)
-        return tf.cast(encoding, self.dtype)
-
-
-class SinusodalPositionalEncoding(tf.keras.layers.Layer):
-
-    def __init__(self, name='SinusodalPositionalEncoding'):
-        super(SinusodalPositionalEncoding, self).__init__(name=name)
-
-    @staticmethod
-    def positional_encoding(len, dim, step=1.):
-        """
-        :param len: int scalar
-        :param dim: int scalar
-        :param step:
-        :return: position embedding
-        """
-        pos_mat = tf.tile(
-            tf.expand_dims(
-                tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32)
-                * step,
-                axis=-1), [1, dim])
-        dim_mat = tf.tile(
-            tf.expand_dims(
-                tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
-                axis=0), [len, 1])
-        dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
-        pos_encoding = tf.where(  # [time, dims]
-            tf.math.equal(tf.math.mod(dim_mat_int, 2), 0),
-            x=tf.math.sin(
-                pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
-            y=tf.math.cos(pos_mat
-                          / tf.pow(10000.,
-                                   (dim_mat - 1) / tf.cast(dim, tf.float32))))
-        return pos_encoding
-
-
-class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer):
-
-    def __init__(self, name='BatchSinusodalPositionalEncoding'):
-        super(BatchSinusodalPositionalEncoding, self).__init__(name=name)
-
-    @staticmethod
-    def positional_encoding(batch_size, len, dim, pos_mat, step=1.):
-        """
-        :param len: int scalar
-        :param dim: int scalar
-        :param step:
-        :param pos_mat: [B, len] = [len, 1] * dim
-        :return: position embedding
-        """
-        pos_mat = tf.tile(
-            tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1),
-            [1, 1, dim])  # [B, len, dim]
-
-        dim_mat = tf.tile(
-            tf.expand_dims(
-                tf.expand_dims(
-                    tf.range(
-                        0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
-                    axis=0),
-                axis=0), [batch_size, len, 1])  # [B, len, dim]
-
-        dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
-        pos_encoding = tf.where(  # [B, time, dims]
-            tf.math.equal(tf.mod(dim_mat_int, 2), 0),
-            x=tf.math.sin(
-                pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
-            y=tf.math.cos(pos_mat
-                          / tf.pow(10000.,
-                                   (dim_mat - 1) / tf.cast(dim, tf.float32))))
-        return pos_encoding
diff --git a/modelscope/models/audio/tts/models/reducer.py b/modelscope/models/audio/tts/models/reducer.py
deleted file mode 100755
index a4c9ae17..00000000
--- a/modelscope/models/audio/tts/models/reducer.py
+++ /dev/null
@@ -1,155 +0,0 @@
-"""Define reducers: objects that merge inputs."""
-
-import abc
-import functools
-
-import tensorflow as tf
-
-
-def pad_in_time(x, padding_length):
-    """Helper function to pad a tensor in the time dimension and retain the static depth dimension."""
-    return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
-
-
-def align_in_time(x, length):
-    """Aligns the time dimension of :obj:`x` with :obj:`length`."""
-    time_dim = tf.shape(x)[1]
-    return tf.cond(
-        tf.less(time_dim, length),
-        true_fn=lambda: pad_in_time(x, length - time_dim),
-        false_fn=lambda: x[:, :length])
-
-
-def pad_with_identity(x,
-                      sequence_length,
-                      max_sequence_length,
-                      identity_values=0,
-                      maxlen=None):
-    """Pads a tensor with identity values up to :obj:`max_sequence_length`.
-    Args:
-      x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``.
-      sequence_length: The true sequence length of :obj:`x`.
-      max_sequence_length: The sequence length up to which the tensor must contain
-        :obj:`identity values`.
-      identity_values: The identity value.
-      maxlen: Size of the output time dimension. Default is the maximum value in
-        obj:`max_sequence_length`.
-    Returns:
-      A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``.
-    """
-    if maxlen is None:
-        maxlen = tf.reduce_max(max_sequence_length)
-
-    mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype)
-    mask = tf.expand_dims(mask, axis=-1)
-    mask_combined = tf.sequence_mask(
-        max_sequence_length, maxlen=maxlen, dtype=x.dtype)
-    mask_combined = tf.expand_dims(mask_combined, axis=-1)
-
-    identity_mask = mask_combined * (1.0 - mask)
-
-    x = pad_in_time(x, maxlen - tf.shape(x)[1])
-    x = x * mask + (identity_mask * identity_values)
-
-    return x
-
-
-def pad_n_with_identity(inputs, sequence_lengths, identity_values=0):
-    """Pads each input tensors with identity values up to
-    ``max(sequence_lengths)`` for each batch.
-    Args:
-      inputs: A list of ``tf.Tensor``.
-      sequence_lengths: A list of sequence length.
-      identity_values: The identity value.
-    Returns:
-      A tuple ``(padded, max_sequence_length)`` which are respectively a list of
-      ``tf.Tensor`` where each tensor are padded with identity and the combined
-      sequence length.
-    """
-    max_sequence_length = tf.reduce_max(sequence_lengths, axis=0)
-    maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs])
-    padded = [
-        pad_with_identity(
-            x,
-            length,
-            max_sequence_length,
-            identity_values=identity_values,
-            maxlen=maxlen) for x, length in zip(inputs, sequence_lengths)
-    ]
-    return padded, max_sequence_length
-
-
-class Reducer(tf.keras.layers.Layer):
-    """Base class for reducers."""
-
-    def zip_and_reduce(self, x, y):
-        """Zips the :obj:`x` with :obj:`y` structures together and reduces all
-        elements. If the structures are nested, they will be flattened first.
-        Args:
-          x: The first structure.
-          y: The second structure.
-        Returns:
-          The same structure as :obj:`x` and :obj:`y` where each element from
-          :obj:`x` is reduced with the correspond element from :obj:`y`.
-        Raises:
-          ValueError: if the two structures are not the same.
-        """
-        tf.nest.assert_same_structure(x, y)
-        x_flat = tf.nest.flatten(x)
-        y_flat = tf.nest.flatten(y)
-        reduced = list(map(self, zip(x_flat, y_flat)))
-        return tf.nest.pack_sequence_as(x, reduced)
-
-    def call(self, inputs, sequence_length=None):  # pylint: disable=arguments-differ
-        """Reduces all input elements.
-        Args:
-          inputs: A list of ``tf.Tensor``.
-          sequence_length: The length of each input, if reducing sequences.
-        Returns:
-          If :obj:`sequence_length` is set, a tuple
-          ``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor``
-          only.
-        """
-        if sequence_length is None:
-            return self.reduce(inputs)
-        else:
-            return self.reduce_sequence(
-                inputs, sequence_lengths=sequence_length)
-
-    @abc.abstractmethod
-    def reduce(self, inputs):
-        """See :meth:`opennmt.layers.Reducer.__call__`."""
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def reduce_sequence(self, inputs, sequence_lengths):
-        """See :meth:`opennmt.layers.Reducer.__call__`."""
-        raise NotImplementedError()
-
-
-class SumReducer(Reducer):
-    """A reducer that sums the inputs."""
-
-    def reduce(self, inputs):
-        if len(inputs) == 1:
-            return inputs[0]
-        if len(inputs) == 2:
-            return inputs[0] + inputs[1]
-        return tf.add_n(inputs)
-
-    def reduce_sequence(self, inputs, sequence_lengths):
-        padded, combined_length = pad_n_with_identity(
-            inputs, sequence_lengths, identity_values=0)
-        return self.reduce(padded), combined_length
-
-
-class MultiplyReducer(Reducer):
-    """A reducer that multiplies the inputs."""
-
-    def reduce(self, inputs):
-        return functools.reduce(lambda a, x: a * x, inputs)
-
-    def reduce_sequence(self, inputs, sequence_lengths):
-        padded, combined_length = pad_n_with_identity(
-            inputs, sequence_lengths, identity_values=1)
-        return self.reduce(padded), combined_length
diff --git a/modelscope/models/audio/tts/models/rnn_wrappers.py b/modelscope/models/audio/tts/models/rnn_wrappers.py
deleted file mode 100755
index 6c487bab..00000000
--- a/modelscope/models/audio/tts/models/rnn_wrappers.py
+++ /dev/null
@@ -1,237 +0,0 @@
-import tensorflow as tf
-from tensorflow.python.ops import rnn_cell_impl
-
-from .am_models import prenet
-
-
-class VarPredictorCell(tf.contrib.rnn.RNNCell):
-    """Wrapper wrapper knock knock."""
-
-    def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
-        super(VarPredictorCell, self).__init__()
-        self._var_predictor_cell = var_predictor_cell
-        self._is_training = is_training
-        self._dim = dim
-        self._prenet_units = prenet_units
-
-    @property
-    def state_size(self):
-        return tuple([self.output_size, self._var_predictor_cell.state_size])
-
-    @property
-    def output_size(self):
-        return self._dim
-
-    def zero_state(self, batch_size, dtype):
-        return tuple([
-            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
-                                              dtype),
-            self._var_predictor_cell.zero_state(batch_size, dtype)
-        ])
-
-    def call(self, inputs, state):
-        """Run the Tacotron2 super decoder cell."""
-        super_cell_out, decoder_state = state
-
-        # split
-        prenet_input = inputs[:, 0:self._dim]
-        encoder_output = inputs[:, self._dim:]
-
-        # prenet and concat
-        prenet_output = prenet(
-            prenet_input,
-            self._prenet_units,
-            self._is_training,
-            scope='var_prenet')
-        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
-
-        # decoder LSTM/GRU
-        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
-            decoder_input, decoder_state)
-
-        # projection
-        new_super_cell_out = tf.layers.dense(
-            new_super_cell_out, units=self._dim)
-
-        new_states = tuple([new_super_cell_out, new_decoder_state])
-
-        return new_super_cell_out, new_states
-
-
-class DurPredictorCell(tf.contrib.rnn.RNNCell):
-    """Wrapper wrapper knock knock."""
-
-    def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
-        super(DurPredictorCell, self).__init__()
-        self._var_predictor_cell = var_predictor_cell
-        self._is_training = is_training
-        self._dim = dim
-        self._prenet_units = prenet_units
-
-    @property
-    def state_size(self):
-        return tuple([self.output_size, self._var_predictor_cell.state_size])
-
-    @property
-    def output_size(self):
-        return self._dim
-
-    def zero_state(self, batch_size, dtype):
-        return tuple([
-            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
-                                              dtype),
-            self._var_predictor_cell.zero_state(batch_size, dtype)
-        ])
-
-    def call(self, inputs, state):
-        """Run the Tacotron2 super decoder cell."""
-        super_cell_out, decoder_state = state
-
-        # split
-        prenet_input = inputs[:, 0:self._dim]
-        encoder_output = inputs[:, self._dim:]
-
-        # prenet and concat
-        prenet_output = prenet(
-            prenet_input,
-            self._prenet_units,
-            self._is_training,
-            scope='dur_prenet')
-        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
-
-        # decoder LSTM/GRU
-        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
-            decoder_input, decoder_state)
-
-        # projection
-        new_super_cell_out = tf.layers.dense(
-            new_super_cell_out, units=self._dim)
-        new_super_cell_out = tf.nn.relu(new_super_cell_out)
-        #    new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1)
-
-        new_states = tuple([new_super_cell_out, new_decoder_state])
-
-        return new_super_cell_out, new_states
-
-
-class DurPredictorCECell(tf.contrib.rnn.RNNCell):
-    """Wrapper wrapper knock knock."""
-
-    def __init__(self, var_predictor_cell, is_training, dim, prenet_units,
-                 max_dur, dur_embedding_dim):
-        super(DurPredictorCECell, self).__init__()
-        self._var_predictor_cell = var_predictor_cell
-        self._is_training = is_training
-        self._dim = dim
-        self._prenet_units = prenet_units
-        self._max_dur = max_dur
-        self._dur_embedding_dim = dur_embedding_dim
-
-    @property
-    def state_size(self):
-        return tuple([self.output_size, self._var_predictor_cell.state_size])
-
-    @property
-    def output_size(self):
-        return self._max_dur
-
-    def zero_state(self, batch_size, dtype):
-        return tuple([
-            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
-                                              dtype),
-            self._var_predictor_cell.zero_state(batch_size, dtype)
-        ])
-
-    def call(self, inputs, state):
-        """Run the Tacotron2 super decoder cell."""
-        super_cell_out, decoder_state = state
-
-        # split
-        prenet_input = tf.squeeze(
-            tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1)  # [N]
-        prenet_input = tf.one_hot(
-            prenet_input, self._max_dur, on_value=1.0, off_value=0.0,
-            axis=-1)  # [N, 120]
-        prenet_input = tf.layers.dense(
-            prenet_input, units=self._dur_embedding_dim)
-        encoder_output = inputs[:, self._dim:]
-
-        # prenet and concat
-        prenet_output = prenet(
-            prenet_input,
-            self._prenet_units,
-            self._is_training,
-            scope='dur_prenet')
-        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
-
-        # decoder LSTM/GRU
-        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
-            decoder_input, decoder_state)
-
-        # projection
-        new_super_cell_out = tf.layers.dense(
-            new_super_cell_out, units=self._max_dur)  # [N, 120]
-        new_super_cell_out = tf.nn.softmax(new_super_cell_out)  # [N, 120]
-
-        new_states = tuple([new_super_cell_out, new_decoder_state])
-
-        return new_super_cell_out, new_states
-
-
-class VarPredictorCell2(tf.contrib.rnn.RNNCell):
-    """Wrapper wrapper knock knock."""
-
-    def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
-        super(VarPredictorCell2, self).__init__()
-        self._var_predictor_cell = var_predictor_cell
-        self._is_training = is_training
-        self._dim = dim
-        self._prenet_units = prenet_units
-
-    @property
-    def state_size(self):
-        return tuple([self.output_size, self._var_predictor_cell.state_size])
-
-    @property
-    def output_size(self):
-        return self._dim
-
-    def zero_state(self, batch_size, dtype):
-        return tuple([
-            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
-                                              dtype),
-            self._var_predictor_cell.zero_state(batch_size, dtype)
-        ])
-
-    def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
-        super_cell_out, decoder_state = state
-
-        # split
-        prenet_input = inputs[:, 0:self._dim]
-        encoder_output = inputs[:, self._dim:]
-
-        # prenet and concat
-        prenet_output = prenet(
-            prenet_input,
-            self._prenet_units,
-            self._is_training,
-            scope='var_prenet')
-        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
-
-        # decoder LSTM/GRU
-        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
-            decoder_input, decoder_state)
-
-        # projection
-        new_super_cell_out = tf.layers.dense(
-            new_super_cell_out, units=self._dim)
-
-        # split and relu
-        new_super_cell_out = tf.concat([
-            tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:]
-        ], axis=-1)  # yapf:disable
-
-        new_states = tuple([new_super_cell_out, new_decoder_state])
-
-        return new_super_cell_out, new_states
diff --git a/modelscope/models/audio/tts/models/robutrans.py b/modelscope/models/audio/tts/models/robutrans.py
deleted file mode 100755
index ab9fdfcc..00000000
--- a/modelscope/models/audio/tts/models/robutrans.py
+++ /dev/null
@@ -1,760 +0,0 @@
-import tensorflow as tf
-from tensorflow.python.ops.ragged.ragged_util import repeat
-
-from .fsmn_encoder import FsmnEncoderV2
-from .position import BatchSinusodalPositionalEncoding
-from .self_attention_decoder import SelfAttentionDecoder
-from .self_attention_encoder import SelfAttentionEncoder
-
-
-class RobuTrans():
-
-    def __init__(self, hparams):
-        self._hparams = hparams
-
-    def initialize(self,
-                   inputs,
-                   inputs_emotion,
-                   inputs_speaker,
-                   input_lengths,
-                   output_lengths=None,
-                   mel_targets=None,
-                   durations=None,
-                   pitch_contours=None,
-                   uv_masks=None,
-                   pitch_scales=None,
-                   duration_scales=None,
-                   energy_contours=None,
-                   energy_scales=None):
-        """Initializes the model for inference.
-
-        Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields.
-
-        Args:
-          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
-            steps in the input time series, and values are character IDs
-          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
-            of each sequence in inputs.
-          output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
-            of each sequence in outputs.
-          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
-            of steps in the output time series, M is num_mels, and values are entries in the mel
-            spectrogram. Only needed for training.
-        """
-        from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
-        from tensorflow.contrib.seq2seq import BasicDecoder
-
-        with tf.variable_scope('inference') as _:
-            is_training = mel_targets is not None
-            batch_size = tf.shape(inputs)[0]
-            hp = self._hparams
-
-            input_mask = None
-            if input_lengths is not None and is_training:
-                input_mask = tf.sequence_mask(
-                    input_lengths, tf.shape(inputs)[1], dtype=tf.float32)
-
-            if input_mask is not None:
-                inputs = inputs * tf.expand_dims(input_mask, -1)
-
-            # speaker embedding
-            embedded_inputs_speaker = tf.layers.dense(
-                inputs_speaker,
-                32,
-                activation=None,
-                use_bias=False,
-                kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))
-
-            # emotion embedding
-            embedded_inputs_emotion = tf.layers.dense(
-                inputs_emotion,
-                32,
-                activation=None,
-                use_bias=False,
-                kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))
-
-            # symbol embedding
-            with tf.variable_scope('Embedding'):
-                embedded_inputs = tf.layers.dense(
-                    inputs,
-                    hp.embedding_dim,
-                    activation=None,
-                    use_bias=False,
-                    kernel_initializer=tf.truncated_normal_initializer(
-                        stddev=0.5))
-
-            # Encoder
-            with tf.variable_scope('Encoder'):
-                Encoder = SelfAttentionEncoder(
-                    num_layers=hp.encoder_num_layers,
-                    num_units=hp.encoder_num_units,
-                    num_heads=hp.encoder_num_heads,
-                    ffn_inner_dim=hp.encoder_ffn_inner_dim,
-                    dropout=hp.encoder_dropout,
-                    attention_dropout=hp.encoder_attention_dropout,
-                    relu_dropout=hp.encoder_relu_dropout)
-                encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode(
-                    embedded_inputs,
-                    sequence_length=input_lengths,
-                    mode=is_training)
-                encoder_outputs = tf.layers.dense(
-                    encoder_outputs,
-                    hp.encoder_projection_units,
-                    activation=None,
-                    use_bias=False,
-                    kernel_initializer=tf.truncated_normal_initializer(
-                        stddev=0.5))
-
-            # pitch and energy
-            var_inputs = tf.concat([
-                encoder_outputs, embedded_inputs_speaker,
-                embedded_inputs_emotion
-            ], 2)
-            if input_mask is not None:
-                var_inputs = var_inputs * tf.expand_dims(input_mask, -1)
-
-            with tf.variable_scope('Pitch_Predictor'):
-                Pitch_Predictor_FSMN = FsmnEncoderV2(
-                    filter_size=hp.predictor_filter_size,
-                    fsmn_num_layers=hp.predictor_fsmn_num_layers,
-                    dnn_num_layers=hp.predictor_dnn_num_layers,
-                    num_memory_units=hp.predictor_num_memory_units,
-                    ffn_inner_dim=hp.predictor_ffn_inner_dim,
-                    dropout=hp.predictor_dropout,
-                    shift=hp.predictor_shift,
-                    position_encoder=None)
-                pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode(
-                    tf.concat([
-                        encoder_outputs, embedded_inputs_speaker,
-                        embedded_inputs_emotion
-                    ], 2),
-                    sequence_length=input_lengths,
-                    mode=is_training)
-                pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    pitch_contour_outputs,
-                    sequence_length=input_lengths,
-                    dtype=tf.float32)
-                pitch_contour_outputs = tf.concat(
-                    pitch_contour_outputs, axis=-1)
-                pitch_contour_outputs = tf.layers.dense(
-                    pitch_contour_outputs, units=1)  # [N, T_in, 1]
-                pitch_contour_outputs = tf.squeeze(
-                    pitch_contour_outputs, axis=2)  # [N, T_in]
-
-            with tf.variable_scope('Energy_Predictor'):
-                Energy_Predictor_FSMN = FsmnEncoderV2(
-                    filter_size=hp.predictor_filter_size,
-                    fsmn_num_layers=hp.predictor_fsmn_num_layers,
-                    dnn_num_layers=hp.predictor_dnn_num_layers,
-                    num_memory_units=hp.predictor_num_memory_units,
-                    ffn_inner_dim=hp.predictor_ffn_inner_dim,
-                    dropout=hp.predictor_dropout,
-                    shift=hp.predictor_shift,
-                    position_encoder=None)
-                energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode(
-                    tf.concat([
-                        encoder_outputs, embedded_inputs_speaker,
-                        embedded_inputs_emotion
-                    ], 2),
-                    sequence_length=input_lengths,
-                    mode=is_training)
-                energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    energy_contour_outputs,
-                    sequence_length=input_lengths,
-                    dtype=tf.float32)
-                energy_contour_outputs = tf.concat(
-                    energy_contour_outputs, axis=-1)
-                energy_contour_outputs = tf.layers.dense(
-                    energy_contour_outputs, units=1)  # [N, T_in, 1]
-                energy_contour_outputs = tf.squeeze(
-                    energy_contour_outputs, axis=2)  # [N, T_in]
-
-            if is_training:
-                pitch_embeddings = tf.expand_dims(
-                    pitch_contours, axis=2)  # [N, T_in, 1]
-                pitch_embeddings = tf.layers.conv1d(
-                    pitch_embeddings,
-                    filters=hp.encoder_projection_units,
-                    kernel_size=9,
-                    padding='same',
-                    name='pitch_embeddings')  # [N, T_in, 32]
-
-                energy_embeddings = tf.expand_dims(
-                    energy_contours, axis=2)  # [N, T_in, 1]
-                energy_embeddings = tf.layers.conv1d(
-                    energy_embeddings,
-                    filters=hp.encoder_projection_units,
-                    kernel_size=9,
-                    padding='same',
-                    name='energy_embeddings')  # [N, T_in, 32]
-            else:
-                pitch_contour_outputs *= pitch_scales
-                pitch_embeddings = tf.expand_dims(
-                    pitch_contour_outputs, axis=2)  # [N, T_in, 1]
-                pitch_embeddings = tf.layers.conv1d(
-                    pitch_embeddings,
-                    filters=hp.encoder_projection_units,
-                    kernel_size=9,
-                    padding='same',
-                    name='pitch_embeddings')  # [N, T_in, 32]
-
-                energy_contour_outputs *= energy_scales
-                energy_embeddings = tf.expand_dims(
-                    energy_contour_outputs, axis=2)  # [N, T_in, 1]
-                energy_embeddings = tf.layers.conv1d(
-                    energy_embeddings,
-                    filters=hp.encoder_projection_units,
-                    kernel_size=9,
-                    padding='same',
-                    name='energy_embeddings')  # [N, T_in, 32]
-
-            encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings
-
-            # duration
-            dur_inputs = tf.concat([
-                encoder_outputs_, embedded_inputs_speaker,
-                embedded_inputs_emotion
-            ], 2)
-            if input_mask is not None:
-                dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1)
-            with tf.variable_scope('Duration_Predictor'):
-                duration_predictor_cell = MultiRNNCell([
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    LSTMBlockCell(hp.predictor_lstm_units)
-                ], state_is_tuple=True)  # yapf:disable
-                from .rnn_wrappers import DurPredictorCell
-                duration_output_cell = DurPredictorCell(
-                    duration_predictor_cell, is_training, 1,
-                    hp.predictor_prenet_units)
-                duration_predictor_init_state = duration_output_cell.zero_state(
-                    batch_size=batch_size, dtype=tf.float32)
-                if is_training:
-                    from .helpers import VarTrainingHelper
-                    duration_helper = VarTrainingHelper(
-                        tf.expand_dims(
-                            tf.log(tf.cast(durations, tf.float32) + 1),
-                            axis=2), dur_inputs, 1)
-                else:
-                    from .helpers import VarTestHelper
-                    duration_helper = VarTestHelper(batch_size, dur_inputs, 1)
-                (
-                    duration_outputs, _
-                ), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode(
-                    BasicDecoder(duration_output_cell, duration_helper,
-                                 duration_predictor_init_state),
-                    maximum_iterations=1000)
-                duration_outputs = tf.squeeze(
-                    duration_outputs, axis=2)  # [N, T_in]
-                if input_mask is not None:
-                    duration_outputs = duration_outputs * input_mask
-                duration_outputs_ = tf.exp(duration_outputs) - 1
-
-            # Length Regulator
-            with tf.variable_scope('Length_Regulator'):
-                if is_training:
-                    i = tf.constant(1)
-                    # position embedding
-                    j = tf.constant(1)
-                    dur_len = tf.shape(durations)[-1]
-                    embedded_position_i = tf.range(1, durations[0, 0] + 1)
-
-                    def condition_pos(j, e):
-                        return tf.less(j, dur_len)
-
-                    def loop_body_pos(j, embedded_position_i):
-                        embedded_position_i = tf.concat([
-                            embedded_position_i,
-                            tf.range(1, durations[0, j] + 1)
-                        ], axis=0)  # yapf:disable
-                        return [j + 1, embedded_position_i]
-
-                    j, embedded_position_i = tf.while_loop(
-                        condition_pos,
-                        loop_body_pos, [j, embedded_position_i],
-                        shape_invariants=[
-                            j.get_shape(),
-                            tf.TensorShape([None])
-                        ])
-                    embedded_position = tf.reshape(embedded_position_i,
-                                                   (1, -1))
-
-                    # others
-                    LR_outputs = repeat(
-                        encoder_outputs_[0:1, :, :], durations[0, :], axis=1)
-                    embedded_outputs_speaker = repeat(
-                        embedded_inputs_speaker[0:1, :, :],
-                        durations[0, :],
-                        axis=1)
-                    embedded_outputs_emotion = repeat(
-                        embedded_inputs_emotion[0:1, :, :],
-                        durations[0, :],
-                        axis=1)
-
-                    def condition(i, pos, layer, s, e):
-                        return tf.less(i, tf.shape(mel_targets)[0])
-
-                    def loop_body(i, embedded_position, LR_outputs,
-                                  embedded_outputs_speaker,
-                                  embedded_outputs_emotion):
-                        # position embedding
-                        jj = tf.constant(1)
-                        embedded_position_i = tf.range(1, durations[i, 0] + 1)
-
-                        def condition_pos_i(j, e):
-                            return tf.less(j, dur_len)
-
-                        def loop_body_pos_i(j, embedded_position_i):
-                            embedded_position_i = tf.concat([
-                                embedded_position_i,
-                                tf.range(1, durations[i, j] + 1)
-                            ], axis=0)  # yapf:disable
-                            return [j + 1, embedded_position_i]
-
-                        jj, embedded_position_i = tf.while_loop(
-                            condition_pos_i,
-                            loop_body_pos_i, [jj, embedded_position_i],
-                            shape_invariants=[
-                                jj.get_shape(),
-                                tf.TensorShape([None])
-                            ])
-                        embedded_position = tf.concat([
-                            embedded_position,
-                            tf.reshape(embedded_position_i, (1, -1))
-                        ], 0)
-
-                        # others
-                        LR_outputs = tf.concat([
-                            LR_outputs,
-                            repeat(
-                                encoder_outputs_[i:i + 1, :, :],
-                                durations[i, :],
-                                axis=1)
-                        ], 0)
-                        embedded_outputs_speaker = tf.concat([
-                            embedded_outputs_speaker,
-                            repeat(
-                                embedded_inputs_speaker[i:i + 1, :, :],
-                                durations[i, :],
-                                axis=1)
-                        ], 0)
-                        embedded_outputs_emotion = tf.concat([
-                            embedded_outputs_emotion,
-                            repeat(
-                                embedded_inputs_emotion[i:i + 1, :, :],
-                                durations[i, :],
-                                axis=1)
-                        ], 0)
-                        return [
-                            i + 1, embedded_position, LR_outputs,
-                            embedded_outputs_speaker, embedded_outputs_emotion
-                        ]
-
-                    i, embedded_position, LR_outputs,
-                    embedded_outputs_speaker,
-                    embedded_outputs_emotion = tf.while_loop(
-                        condition,
-                        loop_body, [
-                            i, embedded_position, LR_outputs,
-                            embedded_outputs_speaker, embedded_outputs_emotion
-                        ],
-                        shape_invariants=[
-                            i.get_shape(),
-                            tf.TensorShape([None, None]),
-                            tf.TensorShape([None, None, None]),
-                            tf.TensorShape([None, None, None]),
-                            tf.TensorShape([None, None, None])
-                        ],
-                        parallel_iterations=hp.batch_size)
-
-                    ori_framenum = tf.shape(mel_targets)[1]
-                else:
-                    # position
-                    j = tf.constant(1)
-                    dur_len = tf.shape(duration_outputs_)[-1]
-                    embedded_position_i = tf.range(
-                        1,
-                        tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32)
-                        + 1)
-
-                    def condition_pos(j, e):
-                        return tf.less(j, dur_len)
-
-                    def loop_body_pos(j, embedded_position_i):
-                        embedded_position_i = tf.concat([
-                            embedded_position_i,
-                            tf.range(
-                                1,
-                                tf.cast(
-                                    tf.round(duration_outputs_)[0, j],
-                                    tf.int32) + 1)
-                        ], axis=0)  # yapf:disable
-                        return [j + 1, embedded_position_i]
-
-                    j, embedded_position_i = tf.while_loop(
-                        condition_pos,
-                        loop_body_pos, [j, embedded_position_i],
-                        shape_invariants=[
-                            j.get_shape(),
-                            tf.TensorShape([None])
-                        ])
-                    embedded_position = tf.reshape(embedded_position_i,
-                                                   (1, -1))
-                    # others
-                    duration_outputs_ *= duration_scales
-                    LR_outputs = repeat(
-                        encoder_outputs_[0:1, :, :],
-                        tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
-                        axis=1)
-                    embedded_outputs_speaker = repeat(
-                        embedded_inputs_speaker[0:1, :, :],
-                        tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
-                        axis=1)
-                    embedded_outputs_emotion = repeat(
-                        embedded_inputs_emotion[0:1, :, :],
-                        tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
-                        axis=1)
-                    ori_framenum = tf.shape(LR_outputs)[1]
-
-                    left = hp.outputs_per_step - tf.mod(
-                        ori_framenum, hp.outputs_per_step)
-                    LR_outputs = tf.cond(
-                        tf.equal(left,
-                                 hp.outputs_per_step), lambda: LR_outputs,
-                        lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]],
-                                       'CONSTANT'))
-                    embedded_outputs_speaker = tf.cond(
-                        tf.equal(left, hp.outputs_per_step),
-                        lambda: embedded_outputs_speaker, lambda: tf.pad(
-                            embedded_outputs_speaker, [[0, 0], [0, left],
-                                                       [0, 0]], 'CONSTANT'))
-                    embedded_outputs_emotion = tf.cond(
-                        tf.equal(left, hp.outputs_per_step),
-                        lambda: embedded_outputs_emotion, lambda: tf.pad(
-                            embedded_outputs_emotion, [[0, 0], [0, left],
-                                                       [0, 0]], 'CONSTANT'))
-                    embedded_position = tf.cond(
-                        tf.equal(left, hp.outputs_per_step),
-                        lambda: embedded_position,
-                        lambda: tf.pad(embedded_position, [[0, 0], [0, left]],
-                                       'CONSTANT'))
-
-            # Pos_Embedding
-            with tf.variable_scope('Position_Embedding'):
-                Pos_Embedding = BatchSinusodalPositionalEncoding()
-                position_embeddings = Pos_Embedding.positional_encoding(
-                    batch_size,
-                    tf.shape(LR_outputs)[1], hp.encoder_projection_units,
-                    embedded_position)
-            LR_outputs += position_embeddings
-
-            # multi-frame
-            LR_outputs = tf.reshape(LR_outputs, [
-                batch_size, -1,
-                hp.outputs_per_step * hp.encoder_projection_units
-            ])
-            embedded_outputs_speaker = tf.reshape(
-                embedded_outputs_speaker,
-                [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
-            embedded_outputs_emotion = tf.reshape(
-                embedded_outputs_emotion,
-                [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
-            # [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64)
-            LR_outputs = tf.concat([
-                LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion
-            ], -1)
-
-            # auto bandwidth
-            if is_training:
-                durations_mask = tf.cast(durations,
-                                         tf.float32) * input_mask  # [N, T_in]
-            else:
-                durations_mask = duration_outputs_
-            X_band_width = tf.cast(
-                tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step),
-                tf.int32)
-            H_band_width = X_band_width
-
-            with tf.variable_scope('Decoder'):
-                Decoder = SelfAttentionDecoder(
-                    num_layers=hp.decoder_num_layers,
-                    num_units=hp.decoder_num_units,
-                    num_heads=hp.decoder_num_heads,
-                    ffn_inner_dim=hp.decoder_ffn_inner_dim,
-                    dropout=hp.decoder_dropout,
-                    attention_dropout=hp.decoder_attention_dropout,
-                    relu_dropout=hp.decoder_relu_dropout,
-                    prenet_units=hp.prenet_units,
-                    dense_units=hp.prenet_proj_units,
-                    num_mels=hp.num_mels,
-                    outputs_per_step=hp.outputs_per_step,
-                    X_band_width=X_band_width,
-                    H_band_width=H_band_width,
-                    position_encoder=None)
-                if is_training:
-                    if hp.free_run:
-                        r = hp.outputs_per_step
-                        init_decoder_input = tf.expand_dims(
-                            tf.tile([[0.0]], [batch_size, hp.num_mels]),
-                            axis=1)  # [N, 1, hp.num_mels]
-                        decoder_input_lengths = tf.cast(
-                            output_lengths / r, tf.int32)
-                        decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
-                            init_decoder_input,
-                            maximum_iterations=tf.shape(LR_outputs)[1],
-                            mode=is_training,
-                            memory=LR_outputs,
-                            memory_sequence_length=decoder_input_lengths)
-                    else:
-                        r = hp.outputs_per_step
-                        decoder_input = mel_targets[:, r - 1::
-                                                    r, :]  # [N, T_out / r, hp.num_mels]
-                        init_decoder_input = tf.expand_dims(
-                            tf.tile([[0.0]], [batch_size, hp.num_mels]),
-                            axis=1)  # [N, 1, hp.num_mels]
-                        decoder_input = tf.concat(
-                            [init_decoder_input, decoder_input],
-                            axis=1)  # [N, T_out / r + 1, hp.num_mels]
-                        decoder_input = decoder_input[:, :
-                                                      -1, :]  # [N, T_out / r, hp.num_mels]
-                        decoder_input_lengths = tf.cast(
-                            output_lengths / r, tf.int32)
-                        decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs(
-                            decoder_input,
-                            decoder_input_lengths,
-                            mode=is_training,
-                            memory=LR_outputs,
-                            memory_sequence_length=decoder_input_lengths)
-                else:
-                    init_decoder_input = tf.expand_dims(
-                        tf.tile([[0.0]], [batch_size, hp.num_mels]),
-                        axis=1)  # [N, 1, hp.num_mels]
-                    decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
-                        init_decoder_input,
-                        maximum_iterations=tf.shape(LR_outputs)[1],
-                        mode=is_training,
-                        memory=LR_outputs,
-                        memory_sequence_length=tf.expand_dims(
-                            tf.shape(LR_outputs)[1], axis=0))
-
-                if is_training:
-                    mel_outputs_ = tf.reshape(decoder_outputs,
-                                              [batch_size, -1, hp.num_mels])
-                else:
-                    mel_outputs_ = tf.reshape(
-                        decoder_outputs,
-                        [batch_size, -1, hp.num_mels])[:, :ori_framenum, :]
-                mel_outputs = mel_outputs_
-
-            with tf.variable_scope('Postnet'):
-                Postnet_FSMN = FsmnEncoderV2(
-                    filter_size=hp.postnet_filter_size,
-                    fsmn_num_layers=hp.postnet_fsmn_num_layers,
-                    dnn_num_layers=hp.postnet_dnn_num_layers,
-                    num_memory_units=hp.postnet_num_memory_units,
-                    ffn_inner_dim=hp.postnet_ffn_inner_dim,
-                    dropout=hp.postnet_dropout,
-                    shift=hp.postnet_shift,
-                    position_encoder=None)
-                if is_training:
-                    postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
-                        mel_outputs,
-                        sequence_length=output_lengths,
-                        mode=is_training)
-                    hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
-                        LSTMBlockCell(hp.postnet_lstm_units),
-                        postnet_fsmn_outputs,
-                        sequence_length=output_lengths,
-                        dtype=tf.float32)
-                else:
-                    postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
-                        mel_outputs,
-                        sequence_length=[tf.shape(mel_outputs_)[1]],
-                        mode=is_training)
-                    hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
-                        LSTMBlockCell(hp.postnet_lstm_units),
-                        postnet_fsmn_outputs,
-                        sequence_length=[tf.shape(mel_outputs_)[1]],
-                        dtype=tf.float32)
-
-            mel_residual_outputs = tf.layers.dense(
-                hidden_lstm_outputs, units=hp.num_mels)
-            mel_outputs += mel_residual_outputs
-
-            self.inputs = inputs
-            self.inputs_speaker = inputs_speaker
-            self.inputs_emotion = inputs_emotion
-            self.input_lengths = input_lengths
-            self.durations = durations
-            self.output_lengths = output_lengths
-            self.mel_outputs_ = mel_outputs_
-            self.mel_outputs = mel_outputs
-            self.mel_targets = mel_targets
-            self.duration_outputs = duration_outputs
-            self.duration_outputs_ = duration_outputs_
-            self.duration_scales = duration_scales
-            self.pitch_contour_outputs = pitch_contour_outputs
-            self.pitch_contours = pitch_contours
-            self.pitch_scales = pitch_scales
-            self.energy_contour_outputs = energy_contour_outputs
-            self.energy_contours = energy_contours
-            self.energy_scales = energy_scales
-            self.uv_masks_ = uv_masks
-
-            self.embedded_inputs_emotion = embedded_inputs_emotion
-            self.embedding_fsmn_outputs = embedded_inputs
-            self.encoder_outputs = encoder_outputs
-            self.encoder_outputs_ = encoder_outputs_
-            self.LR_outputs = LR_outputs
-            self.postnet_fsmn_outputs = postnet_fsmn_outputs
-
-            self.pitch_embeddings = pitch_embeddings
-            self.energy_embeddings = energy_embeddings
-
-            self.attns = attns
-            self.attention_x = attention_x
-            self.attention_h = attention_h
-            self.X_band_width = X_band_width
-            self.H_band_width = H_band_width
-
-    def add_loss(self):
-        '''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
-        with tf.variable_scope('loss') as _:
-            hp = self._hparams
-            mask = tf.sequence_mask(
-                self.output_lengths,
-                tf.shape(self.mel_targets)[1],
-                dtype=tf.float32)
-            valid_outputs = tf.reduce_sum(mask)
-
-            mask_input = tf.sequence_mask(
-                self.input_lengths,
-                tf.shape(self.durations)[1],
-                dtype=tf.float32)
-            valid_inputs = tf.reduce_sum(mask_input)
-
-            # mel loss
-            if self.uv_masks_ is not None:
-                valid_outputs_mask = tf.reduce_sum(
-                    tf.expand_dims(mask, -1) * self.uv_masks_)
-                self.mel_loss_ = tf.reduce_sum(
-                    tf.abs(self.mel_targets - self.mel_outputs_)
-                    * tf.expand_dims(mask, -1) * self.uv_masks_) / (
-                        valid_outputs_mask * hp.num_mels)
-                self.mel_loss = tf.reduce_sum(
-                    tf.abs(self.mel_targets - self.mel_outputs)
-                    * tf.expand_dims(mask, -1) * self.uv_masks_) / (
-                        valid_outputs_mask * hp.num_mels)
-            else:
-                self.mel_loss_ = tf.reduce_sum(
-                    tf.abs(self.mel_targets - self.mel_outputs_)
-                    * tf.expand_dims(mask, -1)) / (
-                        valid_outputs * hp.num_mels)
-                self.mel_loss = tf.reduce_sum(
-                    tf.abs(self.mel_targets - self.mel_outputs)
-                    * tf.expand_dims(mask, -1)) / (
-                        valid_outputs * hp.num_mels)
-
-            # duration loss
-            self.duration_loss = tf.reduce_sum(
-                tf.abs(
-                    tf.log(tf.cast(self.durations, tf.float32) + 1)
-                    - self.duration_outputs) * mask_input) / valid_inputs
-
-            # pitch contour loss
-            self.pitch_contour_loss = tf.reduce_sum(
-                tf.abs(self.pitch_contours - self.pitch_contour_outputs)
-                * mask_input) / valid_inputs
-
-            # energy contour loss
-            self.energy_contour_loss = tf.reduce_sum(
-                tf.abs(self.energy_contours - self.energy_contour_outputs)
-                * mask_input) / valid_inputs
-
-            # final loss
-            self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \
-                + self.pitch_contour_loss + self.energy_contour_loss
-
-            # guided attention loss
-            self.guided_attention_loss = tf.constant(0.0)
-            if hp.guided_attention:
-                i0 = tf.constant(0)
-                loss0 = tf.constant(0.0)
-
-                def c(i, _):
-                    return tf.less(i, tf.shape(mel_targets)[0])
-
-                def loop_body(i, loss):
-                    decoder_input_lengths = tf.cast(
-                        self.output_lengths / hp.outputs_per_step, tf.int32)
-                    input_len = decoder_input_lengths[i]
-                    output_len = decoder_input_lengths[i]
-                    input_w = tf.expand_dims(
-                        tf.range(tf.cast(input_len, dtype=tf.float32)),
-                        axis=1) / tf.cast(
-                            input_len, dtype=tf.float32)  # [T_in, 1]
-                    output_w = tf.expand_dims(
-                        tf.range(tf.cast(output_len, dtype=tf.float32)),
-                        axis=0) / tf.cast(
-                            output_len, dtype=tf.float32)  # [1, T_out]
-                    guided_attention_w = 1.0 - tf.exp(
-                        -(1 / hp.guided_attention_2g_squared)
-                        * tf.square(input_w - output_w))  # [T_in, T_out]
-                    guided_attention_w = tf.expand_dims(
-                        guided_attention_w, axis=0)  # [1, T_in, T_out]
-                    # [hp.decoder_num_heads, T_in, T_out]
-                    guided_attention_w = tf.tile(guided_attention_w,
-                                                 [hp.decoder_num_heads, 1, 1])
-                    loss_i = tf.constant(0.0)
-                    for j in range(hp.decoder_num_layers):
-                        loss_i += tf.reduce_mean(
-                            self.attention_h[j][i, :, :input_len, :output_len]
-                            * guided_attention_w)
-
-                    return [tf.add(i, 1), tf.add(loss, loss_i)]
-
-                _, loss = tf.while_loop(
-                    c,
-                    loop_body,
-                    loop_vars=[i0, loss0],
-                    parallel_iterations=hp.batch_size)
-                self.guided_attention_loss = loss / hp.batch_size
-                self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss
-
-    def add_optimizer(self, global_step):
-        '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.
-
-        Args:
-          global_step: int32 scalar Tensor representing current global step in training
-        '''
-        with tf.variable_scope('optimizer') as _:
-            hp = self._hparams
-            if hp.decay_learning_rate:
-                self.learning_rate = _learning_rate_decay(
-                    hp.initial_learning_rate, global_step)
-            else:
-                self.learning_rate = tf.convert_to_tensor(
-                    hp.initial_learning_rate)
-            optimizer = tf.train.AdamOptimizer(self.learning_rate,
-                                               hp.adam_beta1, hp.adam_beta2)
-            gradients, variables = zip(*optimizer.compute_gradients(self.loss))
-            self.gradients = gradients
-            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
-
-            # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
-            # https://github.com/tensorflow/tensorflow/issues/1122
-            with tf.control_dependencies(
-                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
-                self.optimize = optimizer.apply_gradients(
-                    zip(clipped_gradients, variables), global_step=global_step)
-
-
-def _learning_rate_decay(init_lr, global_step):
-    # Noam scheme from tensor2tensor:
-    warmup_steps = 4000.0
-    step = tf.cast(global_step + 1, dtype=tf.float32)
-    return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5,
-                                                    step**-0.5)
diff --git a/modelscope/models/audio/tts/models/self_attention_decoder.py b/modelscope/models/audio/tts/models/self_attention_decoder.py
deleted file mode 100755
index 9cf3fcaa..00000000
--- a/modelscope/models/audio/tts/models/self_attention_decoder.py
+++ /dev/null
@@ -1,817 +0,0 @@
-"""Define self-attention decoder."""
-
-import sys
-
-import tensorflow as tf
-
-from . import compat, transformer
-from .am_models import decoder_prenet
-from .position import SinusoidalPositionEncoder
-
-
-class SelfAttentionDecoder():
-    """Decoder using self-attention as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def __init__(self,
-                 num_layers,
-                 num_units=512,
-                 num_heads=8,
-                 ffn_inner_dim=2048,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 relu_dropout=0.1,
-                 prenet_units=256,
-                 dense_units=128,
-                 num_mels=80,
-                 outputs_per_step=3,
-                 X_band_width=None,
-                 H_band_width=None,
-                 position_encoder=SinusoidalPositionEncoder(),
-                 self_attention_type='scaled_dot'):
-        """Initializes the parameters of the decoder.
-
-        Args:
-          num_layers: The number of layers.
-          num_units: The number of hidden units.
-          num_heads: The number of heads in the multi-head attention.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          attention_dropout: The probability to drop units from the attention.
-          relu_dropout: The probability to drop units from the ReLU activation in
-            the feed forward layer.
-          position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-          self_attention_type: Type of self attention, "scaled_dot" or "average" (case
-            insensitive).
-
-        Raises:
-          ValueError: if :obj:`self_attention_type` is invalid.
-        """
-        super(SelfAttentionDecoder, self).__init__()
-        self.num_layers = num_layers
-        self.num_units = num_units
-        self.num_heads = num_heads
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.relu_dropout = relu_dropout
-        self.position_encoder = position_encoder
-        self.self_attention_type = self_attention_type.lower()
-        if self.self_attention_type not in ('scaled_dot', 'average'):
-            raise ValueError('invalid attention type %s'
-                             % self.self_attention_type)
-        if self.self_attention_type == 'average':
-            tf.logging.warning(
-                'Support for average attention network is experimental '
-                'and may change in future versions.')
-        self.prenet_units = prenet_units
-        self.dense_units = dense_units
-        self.num_mels = num_mels
-        self.outputs_per_step = outputs_per_step
-        self.X_band_width = X_band_width
-        self.H_band_width = H_band_width
-
-    @property
-    def output_size(self):
-        """Returns the decoder output size."""
-        return self.num_units
-
-    @property
-    def support_alignment_history(self):
-        return True
-
-    @property
-    def support_multi_source(self):
-        return True
-
-    def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
-        cache = {}
-
-        for layer in range(self.num_layers):
-            proj_cache_shape = [
-                batch_size, self.num_heads, 0, self.num_units // self.num_heads
-            ]
-            layer_cache = {}
-            layer_cache['memory'] = [{
-                'memory_keys':
-                tf.zeros(proj_cache_shape, dtype=dtype),
-                'memory_values':
-                tf.zeros(proj_cache_shape, dtype=dtype)
-            } for _ in range(num_sources)]
-            if self.self_attention_type == 'scaled_dot':
-                layer_cache['self_keys'] = tf.zeros(
-                    proj_cache_shape, dtype=dtype)
-                layer_cache['self_values'] = tf.zeros(
-                    proj_cache_shape, dtype=dtype)
-            elif self.self_attention_type == 'average':
-                layer_cache['prev_g'] = tf.zeros(
-                    [batch_size, 1, self.num_units], dtype=dtype)
-            cache['layer_{}'.format(layer)] = layer_cache
-
-        return cache
-
-    def _init_attn(self, dtype=tf.float32):
-        attn = []
-        for layer in range(self.num_layers):
-            attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True))
-        return attn
-
-    def _self_attention_stack(self,
-                              inputs,
-                              sequence_length=None,
-                              mode=True,
-                              cache=None,
-                              memory=None,
-                              memory_sequence_length=None,
-                              step=None):
-
-        # [N, T_out, self.dense_units] or [N, 1, self.dense_units]
-        prenet_outputs = decoder_prenet(inputs, self.prenet_units,
-                                        self.dense_units, mode)
-        if step is None:
-            decoder_inputs = tf.concat(
-                [memory, prenet_outputs],
-                axis=-1)  # [N, T_out, memory_size + self.dense_units]
-        else:
-            decoder_inputs = tf.concat(
-                [memory[:, step:step + 1, :], prenet_outputs],
-                axis=-1)  # [N, 1, memory_size + self.dense_units]
-        decoder_inputs = tf.layers.dense(
-            decoder_inputs, units=self.dense_units)
-
-        inputs = decoder_inputs
-        inputs *= self.num_units**0.5
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(
-                inputs, position=step + 1 if step is not None else None)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-
-        decoder_mask = None
-        memory_mask = None
-        # last_attention = None
-
-        X_band_width_tmp = -1
-        H_band_width_tmp = -1
-        if self.X_band_width is not None:
-            X_band_width_tmp = tf.cast(
-                tf.cond(
-                    tf.less(tf.shape(memory)[1], self.X_band_width),
-                    lambda: -1, lambda: self.X_band_width),
-                dtype=tf.int64)
-        if self.H_band_width is not None:
-            H_band_width_tmp = tf.cast(
-                tf.cond(
-                    tf.less(tf.shape(memory)[1], self.H_band_width),
-                    lambda: -1, lambda: self.H_band_width),
-                dtype=tf.int64)
-
-        if self.self_attention_type == 'scaled_dot':
-            if sequence_length is not None:
-                decoder_mask = transformer.build_future_mask(
-                    sequence_length,
-                    num_heads=self.num_heads,
-                    maximum_length=tf.shape(inputs)[1],
-                    band=X_band_width_tmp)  # [N, 1, T_out, T_out]
-        elif self.self_attention_type == 'average':
-            if cache is None:
-                if sequence_length is None:
-                    sequence_length = tf.fill([tf.shape(inputs)[0]],
-                                              tf.shape(inputs)[1])
-                decoder_mask = transformer.cumulative_average_mask(
-                    sequence_length,
-                    maximum_length=tf.shape(inputs)[1],
-                    dtype=inputs.dtype)
-
-        if memory is not None and not tf.contrib.framework.nest.is_sequence(
-                memory):
-            memory = (memory, )
-        if memory_sequence_length is not None:
-            if not tf.contrib.framework.nest.is_sequence(
-                    memory_sequence_length):
-                memory_sequence_length = (memory_sequence_length, )
-            if step is None:
-                memory_mask = [
-                    transformer.build_history_mask(
-                        length,
-                        num_heads=self.num_heads,
-                        maximum_length=tf.shape(m)[1],
-                        band=H_band_width_tmp)
-                    for m, length in zip(memory, memory_sequence_length)
-                ]
-            else:
-                memory_mask = [
-                    transformer.build_history_mask(
-                        length,
-                        num_heads=self.num_heads,
-                        maximum_length=tf.shape(m)[1],
-                        band=H_band_width_tmp)[:, :, step:step + 1, :]
-                    for m, length in zip(memory, memory_sequence_length)
-                ]
-
-        # last_attention = None
-        attns_x = []
-        attns_h = []
-        for layer in range(self.num_layers):
-            layer_name = 'layer_{}'.format(layer)
-            layer_cache = cache[layer_name] if cache is not None else None
-            with tf.variable_scope(layer_name):
-                if memory is not None:
-                    for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
-                        memory_cache = None
-                        if layer_cache is not None:
-                            memory_cache = layer_cache['memory'][i]
-                        scope_name = 'multi_head_{}'.format(i)
-                        if i == 0:
-                            scope_name = 'multi_head'
-                        with tf.variable_scope(scope_name):
-                            encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA(
-                                self.num_heads,
-                                transformer.norm(inputs),
-                                mem,
-                                mode,
-                                num_units=self.num_units,
-                                mask=decoder_mask,
-                                mask_h=mask,
-                                cache=layer_cache,
-                                cache_h=memory_cache,
-                                dropout=self.attention_dropout,
-                                return_attention=True,
-                                layer_name=layer_name,
-                                X_band_width=self.X_band_width)
-                            attns_x.append(attn_x)
-                            attns_h.append(attn_h)
-                            context = transformer.drop_and_add(
-                                inputs, encoded, mode, dropout=self.dropout)
-
-                with tf.variable_scope('ffn'):
-                    transformed = transformer.feed_forward_ori(
-                        transformer.norm(context),
-                        self.ffn_inner_dim,
-                        mode,
-                        dropout=self.relu_dropout)
-                    transformed = transformer.drop_and_add(
-                        context, transformed, mode, dropout=self.dropout)
-
-                inputs = transformed
-
-        outputs = transformer.norm(inputs)
-        outputs = tf.layers.dense(
-            outputs, units=self.num_mels * self.outputs_per_step)
-        return outputs, attns_x, attns_h
-
-    def decode_from_inputs(self,
-                           inputs,
-                           sequence_length,
-                           initial_state=None,
-                           mode=True,
-                           memory=None,
-                           memory_sequence_length=None):
-        outputs, attention_x, attention_h = self._self_attention_stack(
-            inputs,
-            sequence_length=sequence_length,
-            mode=mode,
-            memory=memory,
-            memory_sequence_length=memory_sequence_length)
-        return outputs, attention_x, attention_h
-
-    def step_fn(self,
-                mode,
-                batch_size,
-                initial_state=None,
-                memory=None,
-                memory_sequence_length=None,
-                dtype=tf.float32):
-        if memory is None:
-            num_sources = 0
-        elif tf.contrib.framework.nest.is_sequence(memory):
-            num_sources = len(memory)
-        else:
-            num_sources = 1
-        cache = self._init_cache(
-            batch_size, dtype=dtype, num_sources=num_sources)
-        attention_x = self._init_attn(dtype=dtype)
-        attention_h = self._init_attn(dtype=dtype)
-
-        def _fn(step, inputs, cache):
-            outputs, attention_x, attention_h = self._self_attention_stack(
-                inputs,
-                mode=mode,
-                cache=cache,
-                memory=memory,
-                memory_sequence_length=memory_sequence_length,
-                step=step)
-            attention_x_tmp = []
-            for layer in range(len(attention_h)):
-                attention_x_tmp_l = tf.zeros_like(attention_h[layer])
-                if self.X_band_width is not None:
-                    pred = tf.less(step, self.X_band_width + 1)
-                    attention_x_tmp_l_1 = tf.cond(pred,  # yapf:disable
-                                                  lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer],
-                                                  lambda: tf.concat([
-                                                                    attention_x_tmp_l[:, :, :,
-                                                                                      :step - self.X_band_width],
-                                                                    attention_x_tmp_l[:, :, :,
-                                                                                      step - self.X_band_width:step + 1]
-                                                                    + attention_x[layer]],
-                                                                    axis=-1))  # yapf:disable
-                    attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
-                    attention_x_tmp.append(
-                        tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2],
-                                  axis=-1))
-                else:
-                    attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1]
-                    attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
-                    attention_x_tmp.append(
-                        tf.concat([
-                            attention_x_tmp_l_1 + attention_x[layer],
-                            attention_x_tmp_l_2
-                        ], axis=-1))  # yapf:disable
-            attention_x = attention_x_tmp
-            return outputs, cache, attention_x, attention_h
-
-        return _fn, cache, attention_x, attention_h
-
-    def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations,
-                                  mode, memory, memory_sequence_length):
-        batch_size = tf.shape(init_decoder_input)[0]
-        step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
-            mode,
-            batch_size,
-            memory=memory,
-            memory_sequence_length=memory_sequence_length)
-
-        outputs, attention_x, attention_h, cache = self.dynamic_decode(
-            step_fn,
-            init_decoder_input,
-            init_cache=init_cache,
-            init_attn_x=init_attn_x,
-            init_attn_h=init_attn_h,
-            maximum_iterations=maximum_iterations,
-            batch_size=batch_size)
-        return outputs, attention_x, attention_h
-
-    def dynamic_decode_and_search_teacher_forcing(self, decoder_input,
-                                                  maximum_iterations, mode,
-                                                  memory,
-                                                  memory_sequence_length):
-        batch_size = tf.shape(decoder_input)[0]
-        step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
-            mode,
-            batch_size,
-            memory=memory,
-            memory_sequence_length=memory_sequence_length)
-
-        outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing(
-            step_fn,
-            decoder_input,
-            init_cache=init_cache,
-            init_attn_x=init_attn_x,
-            init_attn_h=init_attn_h,
-            maximum_iterations=maximum_iterations,
-            batch_size=batch_size)
-        return outputs, attention_x, attention_h
-
-    def dynamic_decode(self,
-                       step_fn,
-                       init_decoder_input,
-                       init_cache=None,
-                       init_attn_x=None,
-                       init_attn_h=None,
-                       maximum_iterations=None,
-                       batch_size=None):
-
-        def _cond(step, cache, inputs, outputs, attention_x, attention_h):  # pylint: disable=unused-argument
-            return tf.less(step, maximum_iterations)
-
-        def _body(step, cache, inputs, outputs, attention_x, attention_h):
-            # output: [1, 1, num_mels * r]
-            # attn: [1, 1, T_out]
-            output, cache, attn_x, attn_h = step_fn(
-                step, inputs, cache)  # outputs, cache, attention, attns
-            for layer in range(len(attention_x)):
-                attention_x[layer] = attention_x[layer].write(
-                    step, tf.cast(attn_x[layer], tf.float32))
-
-            for layer in range(len(attention_h)):
-                attention_h[layer] = attention_h[layer].write(
-                    step, tf.cast(attn_h[layer], tf.float32))
-
-            outputs = outputs.write(step, tf.cast(output, tf.float32))
-            return step + 1, cache, output[:, :, -self.
-                                           num_mels:], outputs, attention_x, attention_h
-
-        step = tf.constant(0, dtype=tf.int32)
-        outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
-
-        _, cache, _, outputs, attention_x, attention_h = tf.while_loop(
-            _cond,
-            _body,
-            loop_vars=(step, init_cache, init_decoder_input, outputs,
-                       init_attn_x, init_attn_h),
-            shape_invariants=(step.shape,
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_cache),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants,
-                                  init_decoder_input), tf.TensorShape(None),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_attn_x),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_attn_h)),
-            parallel_iterations=1,
-            back_prop=False,
-            maximum_iterations=maximum_iterations)
-        # element of outputs: [N, 1, num_mels * r]
-        outputs_stack = outputs.stack()  # [T_out, N, 1, num_mels * r]
-        outputs_stack = tf.transpose(
-            outputs_stack, perm=[2, 1, 0, 3])  # [1, N, T_out, num_mels * r]
-        outputs_stack = tf.squeeze(
-            outputs_stack, axis=0)  # [N, T_out, num_mels * r]
-
-        attention_x_stack = []
-        for layer in range(len(attention_x)):
-            attention_x_stack_tmp = attention_x[layer].stack(
-            )  # [T_out, N, H, 1, T_out]
-            attention_x_stack_tmp = tf.transpose(
-                attention_x_stack_tmp, perm=[3, 1, 2, 0,
-                                             4])  # [1, N, H, T_out, T_out]
-            attention_x_stack_tmp = tf.squeeze(
-                attention_x_stack_tmp, axis=0)  # [N, H, T_out, T_out]
-            attention_x_stack.append(attention_x_stack_tmp)
-
-        attention_h_stack = []
-        for layer in range(len(attention_h)):
-            attention_h_stack_tmp = attention_h[layer].stack(
-            )  # [T_out, N, H, 1, T_out]
-            attention_h_stack_tmp = tf.transpose(
-                attention_h_stack_tmp, perm=[3, 1, 2, 0,
-                                             4])  # [1, N, H, T_out, T_out]
-            attention_h_stack_tmp = tf.squeeze(
-                attention_h_stack_tmp, axis=0)  # [N, H, T_out, T_out]
-            attention_h_stack.append(attention_h_stack_tmp)
-
-        return outputs_stack, attention_x_stack, attention_h_stack, cache
-
-    def dynamic_decode_teacher_forcing(self,
-                                       step_fn,
-                                       decoder_input,
-                                       init_cache=None,
-                                       init_attn_x=None,
-                                       init_attn_h=None,
-                                       maximum_iterations=None,
-                                       batch_size=None):
-
-        def _cond(step, cache, inputs, outputs, attention_x, attention_h):  # pylint: disable=unused-argument
-            return tf.less(step, maximum_iterations)
-
-        def _body(step, cache, inputs, outputs, attention_x, attention_h):
-            # output: [1, 1, num_mels * r]
-            # attn: [1, 1, T_out]
-            output, cache, attn_x, attn_h = step_fn(
-                step, inputs[:, step:step + 1, :],
-                cache)  # outputs, cache, attention, attns
-            for layer in range(len(attention_x)):
-                attention_x[layer] = attention_x[layer].write(
-                    step, tf.cast(attn_x[layer], tf.float32))
-
-            for layer in range(len(attention_h)):
-                attention_h[layer] = attention_h[layer].write(
-                    step, tf.cast(attn_h[layer], tf.float32))
-            outputs = outputs.write(step, tf.cast(output, tf.float32))
-            return step + 1, cache, inputs, outputs, attention_x, attention_h
-
-        step = tf.constant(0, dtype=tf.int32)
-        outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
-
-        _, cache, _, outputs, attention_x, attention_h = tf.while_loop(
-            _cond,
-            _body,
-            loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x,
-                       init_attn_h),
-            shape_invariants=(step.shape,
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants,
-                                  init_cache), decoder_input.shape,
-                              tf.TensorShape(None),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_attn_x),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_attn_h)),
-            parallel_iterations=1,
-            back_prop=False,
-            maximum_iterations=maximum_iterations)
-        # element of outputs: [N, 1, num_mels * r]
-        outputs_stack = outputs.stack()  # [T_out, N, 1, num_mels * r]
-        outputs_stack = tf.transpose(
-            outputs_stack, perm=[2, 1, 0, 3])  # [1, N, T_out, num_mels * r]
-        outputs_stack = tf.squeeze(
-            outputs_stack, axis=0)  # [N, T_out, num_mels * r]
-
-        attention_x_stack = []
-        for layer in range(len(attention_x)):
-            attention_x_stack_tmp = attention_x[layer].stack(
-            )  # [T_out, N, H, 1, T_out]
-            attention_x_stack_tmp = tf.transpose(
-                attention_x_stack_tmp, perm=[3, 1, 2, 0,
-                                             4])  # [1, N, H, T_out, T_out]
-            attention_x_stack_tmp = tf.squeeze(
-                attention_x_stack_tmp, axis=0)  # [N, H, T_out, T_out]
-            attention_x_stack.append(attention_x_stack_tmp)
-
-        attention_h_stack = []
-        for layer in range(len(attention_h)):
-            attention_h_stack_tmp = attention_h[layer].stack(
-            )  # [T_out, N, H, 1, T_out]
-            attention_h_stack_tmp = tf.transpose(
-                attention_h_stack_tmp, perm=[3, 1, 2, 0,
-                                             4])  # [1, N, H, T_out, T_out]
-            attention_h_stack_tmp = tf.squeeze(
-                attention_h_stack_tmp, axis=0)  # [N, H, T_out, T_out]
-            attention_h_stack.append(attention_h_stack_tmp)
-
-        return outputs_stack, attention_x_stack, attention_h_stack, cache
-
-    def _get_shape_invariants(self, tensor):
-        """Returns the shape of the tensor but sets middle dims to None."""
-        if isinstance(tensor, tf.TensorArray):
-            shape = None
-        else:
-            shape = tensor.shape.as_list()
-            for i in range(1, len(shape) - 1):
-                shape[i] = None
-        return tf.TensorShape(shape)
-
-
-class SelfAttentionDecoderOri():
-    """Decoder using self-attention as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def __init__(self,
-                 num_layers,
-                 num_units=512,
-                 num_heads=8,
-                 ffn_inner_dim=2048,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 relu_dropout=0.1,
-                 position_encoder=SinusoidalPositionEncoder(),
-                 self_attention_type='scaled_dot'):
-        """Initializes the parameters of the decoder.
-
-        Args:
-          num_layers: The number of layers.
-          num_units: The number of hidden units.
-          num_heads: The number of heads in the multi-head attention.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          attention_dropout: The probability to drop units from the attention.
-          relu_dropout: The probability to drop units from the ReLU activation in
-            the feed forward layer.
-          position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-          self_attention_type: Type of self attention, "scaled_dot" or "average" (case
-            insensitive).
-
-        Raises:
-          ValueError: if :obj:`self_attention_type` is invalid.
-        """
-        super(SelfAttentionDecoderOri, self).__init__()
-        self.num_layers = num_layers
-        self.num_units = num_units
-        self.num_heads = num_heads
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.relu_dropout = relu_dropout
-        self.position_encoder = position_encoder
-        self.self_attention_type = self_attention_type.lower()
-        if self.self_attention_type not in ('scaled_dot', 'average'):
-            raise ValueError('invalid attention type %s'
-                             % self.self_attention_type)
-        if self.self_attention_type == 'average':
-            tf.logging.warning(
-                'Support for average attention network is experimental '
-                'and may change in future versions.')
-
-    @property
-    def output_size(self):
-        """Returns the decoder output size."""
-        return self.num_units
-
-    @property
-    def support_alignment_history(self):
-        return True
-
-    @property
-    def support_multi_source(self):
-        return True
-
-    def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
-        cache = {}
-
-        for layer in range(self.num_layers):
-            proj_cache_shape = [
-                batch_size, self.num_heads, 0, self.num_units // self.num_heads
-            ]
-            layer_cache = {}
-            layer_cache['memory'] = [{
-                'memory_keys':
-                tf.zeros(proj_cache_shape, dtype=dtype),
-                'memory_values':
-                tf.zeros(proj_cache_shape, dtype=dtype)
-            } for _ in range(num_sources)]
-            if self.self_attention_type == 'scaled_dot':
-                layer_cache['self_keys'] = tf.zeros(
-                    proj_cache_shape, dtype=dtype)
-                layer_cache['self_values'] = tf.zeros(
-                    proj_cache_shape, dtype=dtype)
-            elif self.self_attention_type == 'average':
-                layer_cache['prev_g'] = tf.zeros(
-                    [batch_size, 1, self.num_units], dtype=dtype)
-            cache['layer_{}'.format(layer)] = layer_cache
-
-        return cache
-
-    def _self_attention_stack(self,
-                              inputs,
-                              sequence_length=None,
-                              mode=True,
-                              cache=None,
-                              memory=None,
-                              memory_sequence_length=None,
-                              step=None):
-        inputs *= self.num_units**0.5
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(
-                inputs, position=step + 1 if step is not None else None)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-
-        decoder_mask = None
-        memory_mask = None
-        last_attention = None
-
-        if self.self_attention_type == 'scaled_dot':
-            if sequence_length is not None:
-                decoder_mask = transformer.build_future_mask(
-                    sequence_length,
-                    num_heads=self.num_heads,
-                    maximum_length=tf.shape(inputs)[1])
-        elif self.self_attention_type == 'average':
-            if cache is None:
-                if sequence_length is None:
-                    sequence_length = tf.fill([tf.shape(inputs)[0]],
-                                              tf.shape(inputs)[1])
-                decoder_mask = transformer.cumulative_average_mask(
-                    sequence_length,
-                    maximum_length=tf.shape(inputs)[1],
-                    dtype=inputs.dtype)
-
-        if memory is not None and not tf.contrib.framework.nest.is_sequence(
-                memory):
-            memory = (memory, )
-        if memory_sequence_length is not None:
-            if not tf.contrib.framework.nest.is_sequence(
-                    memory_sequence_length):
-                memory_sequence_length = (memory_sequence_length, )
-            memory_mask = [
-                transformer.build_sequence_mask(
-                    length,
-                    num_heads=self.num_heads,
-                    maximum_length=tf.shape(m)[1])
-                for m, length in zip(memory, memory_sequence_length)
-            ]
-
-        for layer in range(self.num_layers):
-            layer_name = 'layer_{}'.format(layer)
-            layer_cache = cache[layer_name] if cache is not None else None
-            with tf.variable_scope(layer_name):
-                if self.self_attention_type == 'scaled_dot':
-                    with tf.variable_scope('masked_multi_head'):
-                        encoded = transformer.multi_head_attention(
-                            self.num_heads,
-                            transformer.norm(inputs),
-                            None,
-                            mode,
-                            num_units=self.num_units,
-                            mask=decoder_mask,
-                            cache=layer_cache,
-                            dropout=self.attention_dropout)
-                        last_context = transformer.drop_and_add(
-                            inputs, encoded, mode, dropout=self.dropout)
-                elif self.self_attention_type == 'average':
-                    with tf.variable_scope('average_attention'):
-                        # Cumulative average.
-                        x = transformer.norm(inputs)
-                        y = transformer.cumulative_average(
-                            x,
-                            decoder_mask if cache is None else step,
-                            cache=layer_cache)
-                        # FFN.
-                        y = transformer.feed_forward(
-                            y,
-                            self.ffn_inner_dim,
-                            mode,
-                            dropout=self.relu_dropout)
-                        # Gating layer.
-                        z = tf.layers.dense(
-                            tf.concat([x, y], -1), self.num_units * 2)
-                        i, f = tf.split(z, 2, axis=-1)
-                        y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
-                        last_context = transformer.drop_and_add(
-                            inputs, y, mode, dropout=self.dropout)
-
-                if memory is not None:
-                    for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
-                        memory_cache = layer_cache['memory'][i] if layer_cache is not None else None  # yapf:disable
-                        with tf.variable_scope('multi_head' if i
-                                               == 0 else 'multi_head_%d' % i):  # yapf:disable
-                            context, last_attention = transformer.multi_head_attention(
-                                self.num_heads,
-                                transformer.norm(last_context),
-                                mem,
-                                mode,
-                                mask=mask,
-                                cache=memory_cache,
-                                dropout=self.attention_dropout,
-                                return_attention=True)
-                            last_context = transformer.drop_and_add(
-                                last_context,
-                                context,
-                                mode,
-                                dropout=self.dropout)
-                            if i > 0:  # Do not return attention in case of multi source.
-                                last_attention = None
-
-                with tf.variable_scope('ffn'):
-                    transformed = transformer.feed_forward_ori(
-                        transformer.norm(last_context),
-                        self.ffn_inner_dim,
-                        mode,
-                        dropout=self.relu_dropout)
-                    transformed = transformer.drop_and_add(
-                        last_context, transformed, mode, dropout=self.dropout)
-
-                inputs = transformed
-
-        if last_attention is not None:
-            # The first head of the last layer is returned.
-            first_head_attention = last_attention[:, 0]
-        else:
-            first_head_attention = None
-
-        outputs = transformer.norm(inputs)
-        return outputs, first_head_attention
-
-    def decode_from_inputs(self,
-                           inputs,
-                           sequence_length,
-                           initial_state=None,
-                           mode=True,
-                           memory=None,
-                           memory_sequence_length=None):
-        outputs, attention = self._self_attention_stack(
-            inputs,
-            sequence_length=sequence_length,
-            mode=mode,
-            memory=memory,
-            memory_sequence_length=memory_sequence_length)
-        return outputs, None, attention
-
-    def step_fn(self,
-                mode,
-                batch_size,
-                initial_state=None,
-                memory=None,
-                memory_sequence_length=None,
-                dtype=tf.float32):
-        if memory is None:
-            num_sources = 0
-        elif tf.contrib.framework.nest.is_sequence(memory):
-            num_sources = len(memory)
-        else:
-            num_sources = 1
-        cache = self._init_cache(
-            batch_size, dtype=dtype, num_sources=num_sources)
-
-        def _fn(step, inputs, cache, mode):
-            inputs = tf.expand_dims(inputs, 1)
-            outputs, attention = self._self_attention_stack(
-                inputs,
-                mode=mode,
-                cache=cache,
-                memory=memory,
-                memory_sequence_length=memory_sequence_length,
-                step=step)
-            outputs = tf.squeeze(outputs, axis=1)
-            if attention is not None:
-                attention = tf.squeeze(attention, axis=1)
-            return outputs, cache, attention
-
-        return _fn, cache
diff --git a/modelscope/models/audio/tts/models/self_attention_encoder.py b/modelscope/models/audio/tts/models/self_attention_encoder.py
deleted file mode 100755
index ce4193dc..00000000
--- a/modelscope/models/audio/tts/models/self_attention_encoder.py
+++ /dev/null
@@ -1,182 +0,0 @@
-"""Define the self-attention encoder."""
-
-import tensorflow as tf
-
-from . import transformer
-from .position import SinusoidalPositionEncoder
-
-
-class SelfAttentionEncoder():
-    """Encoder using self-attention as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def __init__(self,
-                 num_layers,
-                 num_units=512,
-                 num_heads=8,
-                 ffn_inner_dim=2048,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 relu_dropout=0.1,
-                 position_encoder=SinusoidalPositionEncoder()):
-        """Initializes the parameters of the encoder.
-
-        Args:
-          num_layers: The number of layers.
-          num_units: The number of hidden units.
-          num_heads: The number of heads in the multi-head attention.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          attention_dropout: The probability to drop units from the attention.
-          relu_dropout: The probability to drop units from the ReLU activation in
-            the feed forward layer.
-          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-        """
-        super(SelfAttentionEncoder, self).__init__()
-        self.num_layers = num_layers
-        self.num_units = num_units
-        self.num_heads = num_heads
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.relu_dropout = relu_dropout
-        self.position_encoder = position_encoder
-
-    def encode(self, inputs, sequence_length=None, mode=True):
-        inputs *= self.num_units**0.5
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(inputs)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-        mask = transformer.build_sequence_mask(
-            sequence_length,
-            num_heads=self.num_heads,
-            maximum_length=tf.shape(inputs)[1])
-
-        mask_FF = tf.squeeze(
-            transformer.build_sequence_mask(
-                sequence_length, maximum_length=tf.shape(inputs)[1]),
-            axis=1)
-
-        state = ()
-
-        attns = []
-        for layer in range(self.num_layers):
-            with tf.variable_scope('layer_{}'.format(layer)):
-                with tf.variable_scope('multi_head'):
-                    context, attn = transformer.multi_head_attention(
-                        self.num_heads,
-                        transformer.norm(inputs),
-                        None,
-                        mode,
-                        num_units=self.num_units,
-                        mask=mask,
-                        dropout=self.attention_dropout,
-                        return_attention=True)
-                    attns.append(attn)
-                    context = transformer.drop_and_add(
-                        inputs, context, mode, dropout=self.dropout)
-
-                with tf.variable_scope('ffn'):
-                    transformed = transformer.feed_forward(
-                        transformer.norm(context),
-                        self.ffn_inner_dim,
-                        mode,
-                        dropout=self.relu_dropout,
-                        mask=mask_FF)
-                    transformed = transformer.drop_and_add(
-                        context, transformed, mode, dropout=self.dropout)
-
-                inputs = transformed
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        outputs = transformer.norm(inputs)
-        return (outputs, state, sequence_length, attns)
-
-
-class SelfAttentionEncoderOri():
-    """Encoder using self-attention as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def __init__(self,
-                 num_layers,
-                 num_units=512,
-                 num_heads=8,
-                 ffn_inner_dim=2048,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 relu_dropout=0.1,
-                 position_encoder=SinusoidalPositionEncoder()):
-        """Initializes the parameters of the encoder.
-
-        Args:
-          num_layers: The number of layers.
-          num_units: The number of hidden units.
-          num_heads: The number of heads in the multi-head attention.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          attention_dropout: The probability to drop units from the attention.
-          relu_dropout: The probability to drop units from the ReLU activation in
-            the feed forward layer.
-          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-        """
-        super(SelfAttentionEncoderOri, self).__init__()
-        self.num_layers = num_layers
-        self.num_units = num_units
-        self.num_heads = num_heads
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.relu_dropout = relu_dropout
-        self.position_encoder = position_encoder
-
-    def encode(self, inputs, sequence_length=None, mode=True):
-        inputs *= self.num_units**0.5
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(inputs)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-        mask = transformer.build_sequence_mask(
-            sequence_length,
-            num_heads=self.num_heads,
-            maximum_length=tf.shape(inputs)[1])  # [N, 1, 1, T_out]
-
-        state = ()
-
-        attns = []
-        for layer in range(self.num_layers):
-            with tf.variable_scope('layer_{}'.format(layer)):
-                with tf.variable_scope('multi_head'):
-                    context, attn = transformer.multi_head_attention(
-                        self.num_heads,
-                        transformer.norm(inputs),
-                        None,
-                        mode,
-                        num_units=self.num_units,
-                        mask=mask,
-                        dropout=self.attention_dropout,
-                        return_attention=True)
-                    attns.append(attn)
-                    context = transformer.drop_and_add(
-                        inputs, context, mode, dropout=self.dropout)
-
-                with tf.variable_scope('ffn'):
-                    transformed = transformer.feed_forward_ori(
-                        transformer.norm(context),
-                        self.ffn_inner_dim,
-                        mode,
-                        dropout=self.relu_dropout)
-                    transformed = transformer.drop_and_add(
-                        context, transformed, mode, dropout=self.dropout)
-
-                inputs = transformed
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        outputs = transformer.norm(inputs)
-        return (outputs, state, sequence_length, attns)
diff --git a/modelscope/models/audio/tts/models/transformer.py b/modelscope/models/audio/tts/models/transformer.py
deleted file mode 100755
index a9f0bedc..00000000
--- a/modelscope/models/audio/tts/models/transformer.py
+++ /dev/null
@@ -1,1157 +0,0 @@
-"""Define layers related to the Google's Transformer model."""
-
-import tensorflow as tf
-
-from . import compat, fsmn
-
-
-def tile_sequence_length(sequence_length, num_heads):
-    """Tiles lengths :obj:`num_heads` times.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-
-    Returns:
-      A ``tf.Tensor`` where each length is replicated :obj:`num_heads` times.
-    """
-    sequence_length = tf.tile(sequence_length, [num_heads])
-    sequence_length = tf.reshape(sequence_length, [num_heads, -1])
-    sequence_length = tf.transpose(sequence_length, perm=[1, 0])
-    sequence_length = tf.reshape(sequence_length, [-1])
-    return sequence_length
-
-
-def build_sequence_mask(sequence_length,
-                        num_heads=None,
-                        maximum_length=None,
-                        dtype=tf.float32):
-    """Builds the dot product mask.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, 1, 1, max_length]``.
-    """
-    mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = tf.expand_dims(mask, axis=1)
-    if num_heads is not None:
-        mask = tf.expand_dims(mask, axis=1)
-    return mask
-
-
-def build_sequence_mask_window(sequence_length,
-                               left_window_size=-1,
-                               right_window_size=-1,
-                               num_heads=None,
-                               maximum_length=None,
-                               dtype=tf.float32):
-    """Builds the dot product mask.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, 1, 1, max_length]``.
-    """
-    sequence_mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = _window_mask(
-        sequence_length,
-        left_window_size=left_window_size,
-        right_window_size=right_window_size,
-        maximum_length=maximum_length,
-        dtype=dtype)
-    mask *= tf.expand_dims(sequence_mask, axis=1)
-    if num_heads is not None:
-        mask = tf.expand_dims(mask, axis=1)
-    return mask
-
-
-def _lower_triangle_mask(sequence_length,
-                         maximum_length=None,
-                         dtype=tf.float32,
-                         band=-1):
-    batch_size = tf.shape(sequence_length)[0]
-    if maximum_length is None:
-        maximum_length = tf.reduce_max(sequence_length)
-    mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype)
-    mask = compat.tf_compat(
-        v2='linalg.band_part', v1='matrix_band_part')(mask, band, 0)
-    return mask
-
-
-def _higher_triangle_mask(sequence_length,
-                          maximum_length=None,
-                          dtype=tf.float32,
-                          band=-1):
-    batch_size = tf.shape(sequence_length)[0]
-    if maximum_length is None:
-        maximum_length = tf.reduce_max(sequence_length)
-    mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype)
-    mask = compat.tf_compat(
-        v2='linalg.band_part', v1='matrix_band_part')(mask, 0, band)
-    return mask
-
-
-def _window_mask(sequence_length,
-                 left_window_size=-1,
-                 right_window_size=-1,
-                 maximum_length=None,
-                 dtype=tf.float32):
-    batch_size = tf.shape(sequence_length)[0]
-    if maximum_length is None:
-        maximum_length = tf.reduce_max(sequence_length)
-    mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype)
-    left_window_size = tf.minimum(
-        tf.cast(left_window_size, tf.int64),
-        tf.cast(maximum_length - 1, tf.int64))
-    right_window_size = tf.minimum(
-        tf.cast(right_window_size, tf.int64),
-        tf.cast(maximum_length - 1, tf.int64))
-    mask = tf.matrix_band_part(mask, left_window_size, right_window_size)
-    return mask
-
-
-def build_future_mask(sequence_length,
-                      num_heads=None,
-                      maximum_length=None,
-                      dtype=tf.float32,
-                      band=-1):
-    """Builds the dot product mask for future positions.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, 1, max_length, max_length]``.
-    """
-    sequence_mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = _lower_triangle_mask(
-        sequence_length, maximum_length=maximum_length, dtype=dtype, band=band)
-    mask *= tf.expand_dims(sequence_mask, axis=1)
-    if num_heads is not None:
-        mask = tf.expand_dims(mask, axis=1)
-    return mask
-
-
-def build_history_mask(sequence_length,
-                       num_heads=None,
-                       maximum_length=None,
-                       dtype=tf.float32,
-                       band=-1):
-    """Builds the dot product mask for future positions.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, 1, max_length, max_length]``.
-    """
-    sequence_mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = _higher_triangle_mask(
-        sequence_length, maximum_length=maximum_length, dtype=dtype, band=band)
-    mask *= tf.expand_dims(sequence_mask, axis=1)
-    if num_heads is not None:
-        mask = tf.expand_dims(mask, axis=1)
-    return mask
-
-
-def cumulative_average_mask(sequence_length,
-                            maximum_length=None,
-                            dtype=tf.float32):
-    """Builds the mask to compute the cumulative average as described in
-    https://arxiv.org/abs/1805.00631.
-
-    Args:
-      sequence_length: The sequence length.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, max_length, max_length]``.
-    """
-    sequence_mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = _lower_triangle_mask(
-        sequence_length, maximum_length=maximum_length, dtype=dtype)
-    mask *= tf.expand_dims(sequence_mask, axis=2)
-    weight = tf.range(1, tf.cast(tf.shape(mask)[1] + 1, dtype), dtype=dtype)
-    mask /= tf.expand_dims(weight, 1)
-    return mask
-
-
-def cumulative_average(inputs, mask_or_step, cache=None):
-    """Computes the cumulative average as described in
-    https://arxiv.org/abs/1805.00631.
-
-    Args:
-      inputs: The sequence to average. A tensor of shape :math:`[B, T, D]`.
-      mask_or_step: If :obj:`cache` is set, this is assumed to be the current step
-        of the dynamic decoding. Otherwise, it is the mask matrix used to compute
-        the cumulative average.
-      cache: A dictionnary containing the cumulative average of the previous step.
-
-    Returns:
-      The cumulative average, a tensor of the same shape and type as :obj:`inputs`.
-    """
-    if cache is not None:
-        step = tf.cast(mask_or_step, inputs.dtype)
-        aa = (inputs + step * cache['prev_g']) / (step + 1.0)
-        cache['prev_g'] = aa
-        return aa
-    else:
-        mask = mask_or_step
-        return tf.matmul(mask, inputs)
-
-
-def fused_projection(inputs, num_units, num_outputs=1):
-    """Projects the same input into multiple output spaces.
-
-    Args:
-      inputs: The inputs to project.
-      num_units: The number of output units of each space.
-      num_outputs: The number of output spaces.
-
-    Returns:
-      :obj:`num_outputs` ``tf.Tensor`` of depth :obj:`num_units`.
-    """
-    return tf.split(
-        tf.layers.conv1d(inputs, num_units * num_outputs, 1),
-        num_outputs,
-        axis=2)
-
-
-def split_heads(inputs, num_heads):
-    """Splits a tensor in depth.
-
-    Args:
-      inputs: A ``tf.Tensor`` of shape :math:`[B, T, D]`.
-      num_heads: The number of heads :math:`H`.
-
-    Returns:
-      A ``tf.Tensor`` of shape :math:`[B, H, T, D / H]`.
-    """
-    static_shape = inputs.get_shape().as_list()
-    depth = static_shape[-1]
-    outputs = tf.reshape(inputs, [
-        tf.shape(inputs)[0],
-        tf.shape(inputs)[1], num_heads, depth // num_heads
-    ])
-    outputs = tf.transpose(outputs, perm=[0, 2, 1, 3])
-    return outputs
-
-
-def combine_heads(inputs):
-    """Concatenates heads.
-
-    Args:
-      inputs: A ``tf.Tensor`` of shape :math:`[B, H, T, D]`.
-
-    Returns:
-      A ``tf.Tensor`` of shape :math:`[B, T, D * H]`.
-    """
-    static_shape = inputs.get_shape().as_list()
-    depth = static_shape[-1]
-    num_heads = static_shape[1]
-    outputs = tf.transpose(inputs, perm=[0, 2, 1, 3])
-    outputs = tf.reshape(
-        outputs,
-        [tf.shape(outputs)[0],
-         tf.shape(outputs)[1], depth * num_heads])
-    return outputs
-
-
-def dot_product_attention(queries, keys, values, mode, mask=None, dropout=0.0):
-    """Computes the dot product attention.
-
-    Args:
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      keys: The sequence use to calculate attention scores. A tensor of shape
-        :math:`[B, T_2, ...]`.
-      values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      dropout: The probability to drop units from the inputs.
-
-    Returns:
-      A tuple ``(context vector, attention vector)``.
-    """
-    dot = tf.matmul(queries, keys, transpose_b=True)
-
-    if mask is not None:
-        dot = tf.cast(
-            tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min),
-            dot.dtype)
-
-    softmax = tf.nn.softmax(tf.cast(dot, tf.float32))
-    attn = tf.cast(softmax, dot.dtype)
-    drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode)
-
-    context = tf.matmul(drop_attn, values)
-
-    return context, attn
-
-
-def dot_product_attention_wpa(num_heads,
-                              queries,
-                              keys,
-                              values,
-                              mode,
-                              attention_left_window=-1,
-                              attention_right_window=0,
-                              mask=None,
-                              max_id_cache=None,
-                              mono=False,
-                              peak_delay=-1,
-                              dropout=0.0):
-    """
-    Computes the dot product attention.
-    Args:
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      keys: The sequence use to calculate attention scores. A tensor of shape
-        :math:`[B, T_2, ...]`.
-      values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      dropout: The probability to drop units from the inputs.
-
-    Returns:
-      A tuple ``(context vector, attention vector)``.
-    """
-    # Dot product between queries and keys.
-    dot = tf.matmul(queries, keys, transpose_b=True)
-    depth = tf.shape(dot)[-1]
-    if mask is not None:
-        dot = tf.cast(
-            tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min),
-            dot.dtype)
-    # wpa
-    max_id = tf.math.argmax(input=dot, axis=-1)
-    # peak delay
-    if peak_delay > 0:
-        if max_id_cache is not None:
-            M = tf.cast(max_id_cache['pre_max_id'], dtype=max_id.dtype)
-            inputs_len = tf.math.minimum(
-                M + peak_delay, tf.cast(depth - 1, dtype=max_id.dtype))
-            delay_mask = tf.sequence_mask(
-                inputs_len, maxlen=depth, dtype=tf.float32)
-            dot = tf.cast(
-                tf.cast(dot, tf.float32) * delay_mask
-                + ((1.0 - delay_mask) * tf.float32.min), dot.dtype)  # yapf:disable
-            max_id = tf.math.argmax(input=dot, axis=-1)
-    # mono
-    if mono:
-        if max_id_cache is None:
-            d = tf.shape(max_id)[-1]
-            tmp_max_id = tf.reshape(max_id, [-1, num_heads, d])
-            tmp_max_id = tf.slice(
-                tmp_max_id, [0, 0, 0],
-                [tf.shape(tmp_max_id)[0],
-                 tf.shape(tmp_max_id)[1], d - 1])
-            zeros = tf.zeros(
-                shape=(tf.shape(tmp_max_id)[0], tf.shape(tmp_max_id)[1], 1),
-                dtype=max_id.dtype)
-            tmp_max_id = tf.concat([zeros, tmp_max_id], axis=-1)
-            mask1 = tf.sequence_mask(
-                tmp_max_id, maxlen=depth, dtype=tf.float32)
-            dot = tf.cast(
-                tf.cast(dot, tf.float32)
-                * (1.0 - mask1) + mask1 * tf.float32.min, dot.dtype)  # yapf:disable
-            max_id = tf.math.argmax(input=dot, axis=-1)
-        else:
-            # eval
-            tmp_max_id = tf.reshape(max_id, [-1, num_heads, 1])
-            max_id_cache['pre_max_id'] = tmp_max_id
-    # right_mask
-    right_offset = tf.constant(attention_right_window, dtype=max_id.dtype)
-    right_len = tf.math.minimum(max_id + right_offset,
-                                tf.cast(depth - 1, dtype=max_id.dtype))
-    right_mask = tf.sequence_mask(right_len, maxlen=depth, dtype=tf.float32)
-    dot = tf.cast(
-        tf.cast(dot, tf.float32) * right_mask
-        + ((1.0 - right_mask) * tf.float32.min), dot.dtype)  # yapf:disable
-    # left_mask
-    if attention_left_window > 0:
-        left_offset = tf.constant(attention_left_window, dtype=max_id.dtype)
-        left_len = tf.math.maximum(max_id - left_offset,
-                                   tf.cast(0, dtype=max_id.dtype))
-        left_mask = tf.sequence_mask(left_len, maxlen=depth, dtype=tf.float32)
-        dot = tf.cast(
-            tf.cast(dot, tf.float32) * (1.0 - left_mask)
-            + (left_mask * tf.float32.min), dot.dtype)  # yapf:disable
-    # Compute attention weights.
-    attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype)
-    drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode)
-
-    # Compute attention context.
-    context = tf.matmul(drop_attn, values)
-
-    return context, attn
-
-
-def multi_head_attention(num_heads,
-                         queries,
-                         memory,
-                         mode,
-                         num_units=None,
-                         mask=None,
-                         cache=None,
-                         dropout=0.0,
-                         return_attention=False):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-
-    if memory is None:
-        queries, keys, values = fused_projection(
-            queries, num_units, num_outputs=3)
-
-        keys = split_heads(keys, num_heads)
-        values = split_heads(values, num_heads)
-
-        if cache is not None:
-            keys = tf.concat([cache['self_keys'], keys], axis=2)
-            values = tf.concat([cache['self_values'], values], axis=2)
-            cache['self_keys'] = keys
-            cache['self_values'] = values
-    else:
-        queries = tf.layers.conv1d(queries, num_units, 1)
-
-        if cache is not None:
-
-            def _project_and_split():
-                k, v = fused_projection(memory, num_units, num_outputs=2)
-                return split_heads(k, num_heads), split_heads(v, num_heads)
-
-            keys, values = tf.cond(
-                tf.equal(tf.shape(cache['memory_keys'])[2], 0),
-                true_fn=_project_and_split,
-                false_fn=lambda:
-                (cache['memory_keys'], cache['memory_values']))
-            cache['memory_keys'] = keys
-            cache['memory_values'] = values
-        else:
-            keys, values = fused_projection(memory, num_units, num_outputs=2)
-            keys = split_heads(keys, num_heads)
-            values = split_heads(values, num_heads)
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention(
-        queries, keys, values, mode, mask=mask, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-
-    if not return_attention:
-        return outputs
-    return outputs, attn
-
-
-def multi_head_attention_PNCA(num_heads,
-                              queries,
-                              memory,
-                              mode,
-                              num_units=None,
-                              mask=None,
-                              mask_h=None,
-                              cache=None,
-                              cache_h=None,
-                              dropout=0.0,
-                              return_attention=False,
-                              X_band_width=None,
-                              layer_name='multi_head'):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-
-    # X
-    queries, keys, values = fused_projection(queries, num_units, num_outputs=3)
-
-    keys = split_heads(keys, num_heads)
-    values = split_heads(values, num_heads)
-
-    if cache is not None:
-        keys = tf.concat([cache['self_keys'], keys], axis=2)
-        values = tf.concat([cache['self_values'], values], axis=2)
-        if X_band_width is not None:
-            keys_band = tf.cond(
-                tf.less(X_band_width, 0), lambda: keys, lambda: tf.cond(
-                    tf.less(tf.shape(keys)[2], X_band_width), lambda: keys,
-                    lambda: keys[:, :, -X_band_width:, :])
-            )  # not support X_band_width == 0
-            values_band = tf.cond(
-                tf.less(X_band_width, 0), lambda: values, lambda: tf.cond(
-                    tf.less(tf.shape(values)[2], X_band_width), lambda: values,
-                    lambda: values[:, :, -X_band_width:, :]))
-            cache['self_keys'] = keys_band
-            cache['self_values'] = values_band
-        else:
-            cache['self_keys'] = keys
-            cache['self_values'] = values
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention(
-        queries, keys, values, mode, mask=mask, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-
-    # H
-    if cache_h is not None:
-
-        def _project_and_split():
-            k, v = fused_projection(memory, num_units, num_outputs=2)
-            return split_heads(k, num_heads), split_heads(v, num_heads)
-
-        keys_h, values_h = tf.cond(
-            tf.equal(tf.shape(cache_h['memory_keys'])[2], 0),
-            true_fn=_project_and_split,
-            false_fn=lambda:
-            (cache_h['memory_keys'], cache_h['memory_values']))
-        cache_h['memory_keys'] = keys_h
-        cache_h['memory_values'] = values_h
-    else:
-        keys_h, values_h = fused_projection(memory, num_units, num_outputs=2)
-        keys_h = split_heads(keys_h, num_heads)
-        values_h = split_heads(values_h, num_heads)
-
-    heads_h, attn_h = dot_product_attention(
-        queries, keys_h, values_h, mode, mask=mask_h, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined_h = combine_heads(heads_h)
-    outputs_h = tf.layers.conv1d(combined_h, num_units, 1)
-
-    # ADD
-    outputs = outputs + outputs_h
-
-    # RETURN
-    return outputs, attn, attn_h
-
-
-def multi_head_attention_memory(num_heads,
-                                queries,
-                                memory,
-                                mode,
-                                num_memory=None,
-                                num_units=None,
-                                mask=None,
-                                cache=None,
-                                dropout=0.0,
-                                return_attention=False):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-
-    # PERSISTENT MEMORY
-    # key memory
-    if num_memory is not None:
-        key_m = tf.get_variable(
-            'key_m',
-            shape=[num_memory, num_units],
-            initializer=tf.glorot_uniform_initializer(),
-            dtype=tf.float32)
-        # value memory
-        value_m = tf.get_variable(
-            'value_m',
-            shape=[num_memory, num_units],
-            initializer=tf.glorot_uniform_initializer(),
-            dtype=tf.float32)
-    if memory is None:
-        queries, keys, values = fused_projection(
-            queries, num_units, num_outputs=3)
-
-        # concat memory
-        if num_memory is not None:
-            key_m_expand = tf.tile(
-                tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1])
-            value_m_expand = tf.tile(
-                tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1])
-            keys = tf.concat([key_m_expand, keys], axis=1)
-            values = tf.concat([value_m_expand, values], axis=1)
-
-        keys = split_heads(keys, num_heads)
-        values = split_heads(values, num_heads)
-
-        if cache is not None:
-            keys = tf.concat([cache['self_keys'], keys], axis=2)
-            values = tf.concat([cache['self_values'], values], axis=2)
-            cache['self_keys'] = keys
-            cache['self_values'] = values
-    else:
-        queries = tf.layers.conv1d(queries, num_units, 1)
-
-        if cache is not None:
-
-            def _project_and_split():
-                k, v = fused_projection(memory, num_units, num_outputs=2)
-                return split_heads(k, num_heads), split_heads(v, num_heads)
-
-            keys, values = tf.cond(
-                tf.equal(tf.shape(cache['memory_keys'])[2], 0),
-                true_fn=_project_and_split,
-                false_fn=lambda:
-                (cache['memory_keys'], cache['memory_values']))
-            cache['memory_keys'] = keys
-            cache['memory_values'] = values
-        else:
-            keys, values = fused_projection(memory, num_units, num_outputs=2)
-            keys = split_heads(keys, num_heads)
-            values = split_heads(values, num_heads)
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention(
-        queries, keys, values, mode, mask=mask, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-
-    if not return_attention:
-        return outputs
-    return outputs, attn
-
-
-def Ci_Cd_Memory(num_heads,
-                 queries,
-                 mode,
-                 filter_size=None,
-                 num_memory=None,
-                 num_units=None,
-                 fsmn_mask=None,
-                 san_mask=None,
-                 cache=None,
-                 shift=None,
-                 dropout=0.0,
-                 return_attention=False):
-    """
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-    # PERSISTENT MEMORY
-    if num_memory is not None:
-        key_m = tf.get_variable(
-            'key_m',
-            shape=[num_memory, num_units],
-            initializer=tf.glorot_uniform_initializer(),
-            dtype=tf.float32)
-        value_m = tf.get_variable(
-            'value_m',
-            shape=[num_memory, num_units],
-            initializer=tf.glorot_uniform_initializer(),
-            dtype=tf.float32)
-
-    queries, keys, values = fused_projection(queries, num_units, num_outputs=3)
-    # fsmn memory block
-    if shift is not None:
-        # encoder
-        fsmn_memory = fsmn.MemoryBlockV2(
-            values,
-            filter_size,
-            mode,
-            shift=shift,
-            mask=fsmn_mask,
-            dropout=dropout)
-    else:
-        # decoder
-        fsmn_memory = fsmn.UniMemoryBlock(
-            values,
-            filter_size,
-            mode,
-            cache=cache,
-            mask=fsmn_mask,
-            dropout=dropout)
-
-    # concat persistent memory
-    if num_memory is not None:
-        key_m_expand = tf.tile(
-            tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1])
-        value_m_expand = tf.tile(
-            tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1])
-        keys = tf.concat([key_m_expand, keys], axis=1)
-        values = tf.concat([value_m_expand, values], axis=1)
-
-    keys = split_heads(keys, num_heads)
-    values = split_heads(values, num_heads)
-
-    if cache is not None:
-        keys = tf.concat([cache['self_keys'], keys], axis=2)
-        values = tf.concat([cache['self_values'], values], axis=2)
-        cache['self_keys'] = keys
-        cache['self_values'] = values
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention(
-        queries, keys, values, mode, mask=san_mask, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-    outputs = outputs + fsmn_memory
-
-    if not return_attention:
-        return outputs
-    return outputs, attn
-
-
-def multi_head_attention_wpa(num_heads,
-                             queries,
-                             memory,
-                             mode,
-                             attention_left_window=-1,
-                             attention_right_window=0,
-                             num_units=None,
-                             mask=None,
-                             cache=None,
-                             max_id_cache=None,
-                             dropout=0.0,
-                             mono=False,
-                             peak_delay=-1,
-                             return_attention=False):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-
-    if memory is None:
-        queries, keys, values = fused_projection(
-            queries, num_units, num_outputs=3)
-
-        keys = split_heads(keys, num_heads)
-        values = split_heads(values, num_heads)
-
-        if cache is not None:
-            keys = tf.concat([cache['self_keys'], keys], axis=2)
-            values = tf.concat([cache['self_values'], values], axis=2)
-            cache['self_keys'] = keys
-            cache['self_values'] = values
-    else:
-        queries = tf.layers.conv1d(queries, num_units, 1)
-
-        if cache is not None:
-
-            def _project_and_split():
-                k, v = fused_projection(memory, num_units, num_outputs=2)
-                return split_heads(k, num_heads), split_heads(v, num_heads)
-
-            keys, values = tf.cond(
-                tf.equal(tf.shape(cache['memory_keys'])[2], 0),
-                true_fn=_project_and_split,
-                false_fn=lambda:
-                (cache['memory_keys'], cache['memory_values']))
-            cache['memory_keys'] = keys
-            cache['memory_values'] = values
-        else:
-            keys, values = fused_projection(memory, num_units, num_outputs=2)
-            keys = split_heads(keys, num_heads)
-            values = split_heads(values, num_heads)
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention_wpa(
-        num_heads,
-        queries,
-        keys,
-        values,
-        mode,
-        attention_left_window=attention_left_window,
-        attention_right_window=attention_right_window,
-        mask=mask,
-        max_id_cache=max_id_cache,
-        mono=mono,
-        peak_delay=peak_delay,
-        dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-
-    if not return_attention:
-        return outputs
-    return outputs, attn
-
-
-def feed_forward(x, inner_dim, mode, dropout=0.0, mask=None):
-    """Implements the Transformer's "Feed Forward" layer.
-
-    .. math::
-
-        ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2
-
-    Args:
-      x: The input.
-      inner_dim: The number of units of the inner linear transformation.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units from the inner transformation.
-
-    Returns:
-      The transformed input.
-    """
-    input_dim = x.get_shape().as_list()[-1]
-
-    if mask is not None:
-        x = x * tf.expand_dims(mask, -1)
-
-    inner = tf.layers.conv1d(
-        x, inner_dim, 3, padding='same', activation=tf.nn.relu)
-
-    if mask is not None:
-        inner = inner * tf.expand_dims(mask, -1)
-    inner = tf.layers.dropout(inner, rate=dropout, training=mode)
-    outer = tf.layers.conv1d(inner, input_dim, 1)
-
-    return outer
-
-
-def feed_forward_ori(x, inner_dim, mode, dropout=0.0):
-    """Implements the Transformer's "Feed Forward" layer.
-
-    .. math::
-
-        ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2
-
-    Args:
-      x: The input.
-      inner_dim: The number of units of the inner linear transformation.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units from the inner transformation.
-
-    Returns:
-      The transformed input.
-    """
-    input_dim = x.get_shape().as_list()[-1]
-
-    inner = tf.layers.conv1d(x, inner_dim, 1, activation=tf.nn.relu)
-    inner = tf.layers.dropout(inner, rate=dropout, training=mode)
-    outer = tf.layers.conv1d(inner, input_dim, 1)
-
-    return outer
-
-
-def norm(inputs):
-    """Layer normalizes :obj:`inputs`."""
-    return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)
-
-
-def drop_and_add(inputs, outputs, mode, dropout=0.1):
-    """Drops units in the outputs and adds the previous values.
-
-    Args:
-      inputs: The input of the previous layer.
-      outputs: The output of the previous layer.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units in :obj:`outputs`.
-
-    Returns:
-      The residual and normalized output.
-    """
-    outputs = tf.layers.dropout(outputs, rate=dropout, training=mode)
-
-    input_dim = inputs.get_shape().as_list()[-1]
-    output_dim = outputs.get_shape().as_list()[-1]
-
-    if input_dim == output_dim:
-        outputs += inputs
-    return outputs
-
-
-class FeedForwardNetwork(tf.keras.layers.Layer):
-    """Implements the Transformer's "Feed Forward" layer.
-
-    .. math::
-
-        ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2
-
-    Note:
-      Object-oriented implementation for TensorFlow 2.0.
-    """
-
-    def __init__(self,
-                 inner_dim,
-                 output_dim,
-                 dropout=0.1,
-                 activation=tf.nn.relu,
-                 **kwargs):
-        """Initializes this layer.
-
-        Args:
-          inner_dim: The number of units of the inner linear transformation.
-          output_dim: The number of units of the ouput linear transformation.
-          dropout: The probability to drop units from the activation output.
-          activation: The activation function to apply between the two linear
-            transformations.
-          kwargs: Additional layer arguments.
-        """
-        super(FeedForwardNetwork, self).__init__(**kwargs)
-        self.inner = tf.keras.layers.Dense(
-            inner_dim, activation=activation, name='inner')
-        self.outer = tf.keras.layers.Dense(output_dim, name='outer')
-        self.dropout = dropout
-
-    def call(self, inputs, training=None):  # pylint: disable=arguments-differ
-        """Runs the layer."""
-        inner = self.inner(inputs)
-        inner = tf.layers.dropout(inner, self.dropout, training=training)
-        return self.outer(inner)
-
-
-class MultiHeadAttention(tf.keras.layers.Layer):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Note:
-      Object-oriented implementation for TensorFlow 2.0.
-    """
-
-    def __init__(self,
-                 num_heads,
-                 num_units,
-                 dropout=0.1,
-                 return_attention=False,
-                 **kwargs):
-        """Initializes this layers.
-
-        Args:
-          num_heads: The number of attention heads.
-          num_units: The number of hidden units.
-          dropout: The probability to drop units from the inputs.
-          return_attention: If ``True``, also return the attention weights of the
-            first head.
-          kwargs: Additional layer arguments.
-        """
-        super(MultiHeadAttention, self).__init__(**kwargs)
-        if num_units % num_heads != 0:
-            raise ValueError(
-                'Multi head attention requires that num_units is a'
-                ' multiple of %s' % num_heads)
-        self.num_heads = num_heads
-        self.num_units = num_units
-        self.linear_queries = tf.keras.layers.Dense(
-            num_units, name='linear_queries')
-        self.linear_keys = tf.keras.layers.Dense(num_units, name='linear_keys')
-        self.linear_values = tf.keras.layers.Dense(
-            num_units, name='linear_values')
-        self.linear_output = tf.keras.layers.Dense(
-            num_units, name='linear_output')
-        self.dropout = dropout
-        self.return_attention = return_attention
-
-    def call(self, inputs, memory=None, mask=None, cache=None, training=None):  # pylint: disable=arguments-differ
-        """Runs the layer.
-
-        Args:
-          inputs: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-          memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-            If ``None``, computes self-attention.
-          mask: A ``tf.Tensor`` applied to the dot product.
-          cache: A dictionary containing pre-projected keys and values.
-          training: Run in training mode.
-
-        Returns:
-          A tuple with the attention context, the updated cache and the attention
-          probabilities of the first head (if :obj:`return_attention` is ``True``).
-        """
-
-        def _compute_kv(x):
-            keys = self.linear_keys(x)
-            keys = split_heads(keys, self.num_heads)
-            values = self.linear_values(x)
-            values = split_heads(values, self.num_heads)
-            return keys, values
-
-        # Compute queries.
-        queries = self.linear_queries(inputs)
-        queries = split_heads(queries, self.num_heads)
-        queries *= (self.num_units // self.num_heads)**-0.5
-
-        # Compute keys and values.
-        if memory is None:
-            keys, values = _compute_kv(inputs)
-            if cache:
-                keys = tf.concat([cache[0], keys], axis=2)
-                values = tf.concat([cache[1], values], axis=2)
-        else:
-            if cache:
-                if not self.linear_keys.built:
-                    # Ensure that the variable names are not impacted by the tf.cond name
-                    # scope if the layers have not already been built.
-                    with tf.name_scope(self.linear_keys.name):
-                        self.linear_keys.build(memory.shape)
-                    with tf.name_scope(self.linear_values.name):
-                        self.linear_values.build(memory.shape)
-                keys, values = tf.cond(
-                    tf.equal(tf.shape(cache[0])[2], 0),
-                    true_fn=lambda: _compute_kv(memory),
-                    false_fn=lambda: cache)
-            else:
-                keys, values = _compute_kv(memory)
-
-        cache = (keys, values)
-
-        # Dot product attention.
-        dot = tf.matmul(queries, keys, transpose_b=True)
-        if mask is not None:
-            mask = tf.expand_dims(tf.cast(mask, tf.float32),
-                                  1)  # Broadcast on heads dimension.
-            dot = tf.cast(
-                tf.cast(dot, tf.float32) * mask
-                + ((1.0 - mask) * tf.float32.min), dot.dtype)  # yapf:disable
-        attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype)
-        drop_attn = tf.layers.dropout(attn, self.dropout, training=training)
-        heads = tf.matmul(drop_attn, values)
-
-        # Concatenate all heads output.
-        combined = combine_heads(heads)
-        outputs = self.linear_output(combined)
-        if self.return_attention:
-            return outputs, cache, attn
-        return outputs, cache
diff --git a/modelscope/models/audio/tts/models/utils.py b/modelscope/models/audio/tts/models/utils.py
deleted file mode 100755
index 03e1ef8c..00000000
--- a/modelscope/models/audio/tts/models/utils.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import glob
-import os
-
-import matplotlib
-import matplotlib.pylab as plt
-import torch
-from torch.nn.utils import weight_norm
-
-matplotlib.use('Agg')
-
-
-def plot_spectrogram(spectrogram):
-    fig, ax = plt.subplots(figsize=(10, 2))
-    im = ax.imshow(
-        spectrogram, aspect='auto', origin='lower', interpolation='none')
-    plt.colorbar(im, ax=ax)
-
-    fig.canvas.draw()
-    plt.close()
-
-    return fig
-
-
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find('Conv') != -1:
-        m.weight.data.normal_(mean, std)
-
-
-def apply_weight_norm(m):
-    classname = m.__class__.__name__
-    if classname.find('Conv') != -1:
-        weight_norm(m)
-
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size * dilation - dilation) / 2)
-
-
-def load_checkpoint(filepath, device):
-    assert os.path.isfile(filepath)
-    print("Loading '{}'".format(filepath))
-    checkpoint_dict = torch.load(filepath, map_location=device)
-    print('Complete.')
-    return checkpoint_dict
-
-
-def save_checkpoint(filepath, obj):
-    print('Saving checkpoint to {}'.format(filepath))
-    torch.save(obj, filepath)
-    print('Complete.')
-
-
-def scan_checkpoint(cp_dir, prefix):
-    pattern = os.path.join(cp_dir, prefix + '????????')
-    cp_list = glob.glob(pattern)
-    if len(cp_list) == 0:
-        return None
-    return sorted(cp_list)[-1]
diff --git a/modelscope/models/audio/tts/models/utils/__init__.py b/modelscope/models/audio/tts/models/utils/__init__.py
new file mode 100644
index 00000000..e07f08ea
--- /dev/null
+++ b/modelscope/models/audio/tts/models/utils/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .utils import *  # noqa F403
diff --git a/modelscope/models/audio/tts/models/utils/utils.py b/modelscope/models/audio/tts/models/utils/utils.py
new file mode 100755
index 00000000..17ac8aee
--- /dev/null
+++ b/modelscope/models/audio/tts/models/utils/utils.py
@@ -0,0 +1,136 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import glob
+import os
+import shutil
+
+import matplotlib
+import matplotlib.pylab as plt
+import torch
+
+matplotlib.use('Agg')
+
+
+class AttrDict(dict):
+
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+    t_path = os.path.join(path, config_name)
+    if config != t_path:
+        os.makedirs(path, exist_ok=True)
+        shutil.copyfile(config, os.path.join(path, config_name))
+
+
+def plot_spectrogram(spectrogram):
+    fig, ax = plt.subplots(figsize=(10, 2))
+    im = ax.imshow(
+        spectrogram, aspect='auto', origin='lower', interpolation='none')
+    plt.colorbar(im, ax=ax)
+
+    fig.canvas.draw()
+    plt.close()
+
+    return fig
+
+
+def plot_alignment(alignment, info=None):
+    fig, ax = plt.subplots()
+    im = ax.imshow(
+        alignment, aspect='auto', origin='lower', interpolation='none')
+    fig.colorbar(im, ax=ax)
+    xlabel = 'Input timestep'
+    if info is not None:
+        xlabel += '\t' + info
+    plt.xlabel(xlabel)
+    plt.ylabel('Output timestep')
+    fig.canvas.draw()
+    plt.close()
+
+    return fig
+
+
+def load_checkpoint(filepath, device):
+    assert os.path.isfile(filepath)
+    checkpoint_dict = torch.load(filepath, map_location=device)
+    return checkpoint_dict
+
+
+def save_checkpoint(filepath, obj):
+    torch.save(obj, filepath)
+
+
+def scan_checkpoint(cp_dir, prefix):
+    pattern = os.path.join(cp_dir, prefix + '????????.pkl')
+    cp_list = glob.glob(pattern)
+    if len(cp_list) == 0:
+        return None
+    return sorted(cp_list)[-1]
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+class ValueWindow():
+
+    def __init__(self, window_size=100):
+        self._window_size = window_size
+        self._values = []
+
+    def append(self, x):
+        self._values = self._values[-(self._window_size - 1):] + [x]
+
+    @property
+    def sum(self):
+        return sum(self._values)
+
+    @property
+    def count(self):
+        return len(self._values)
+
+    @property
+    def average(self):
+        return self.sum / max(1, self.count)
+
+    def reset(self):
+        self._values = []
+
+
+def get_model_size(model):
+    param_num = sum([p.numel() for p in model.parameters() if p.requires_grad])
+    param_size = param_num * 4 / 1024 / 1024
+    return param_size
+
+
+def get_grad_norm(model):
+    total_norm = 0
+    params = [
+        p for p in model.parameters() if p.grad is not None and p.requires_grad
+    ]
+    for p in params:
+        param_norm = p.grad.detach().data.norm(2)
+        total_norm += param_norm.item()**2
+    total_norm = total_norm**0.5
+    return total_norm
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find('Conv') != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_mask_from_lengths(lengths, max_len=None):
+    batch_size = lengths.shape[0]
+    if max_len is None:
+        max_len = torch.max(lengths).item()
+
+    ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size,
+                                                       -1).to(lengths.device)
+    mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
+
+    return mask
diff --git a/modelscope/models/audio/tts/models/vocoder_models.py b/modelscope/models/audio/tts/models/vocoder_models.py
deleted file mode 100755
index c46a9204..00000000
--- a/modelscope/models/audio/tts/models/vocoder_models.py
+++ /dev/null
@@ -1,516 +0,0 @@
-from distutils.version import LooseVersion
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
-from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
-
-from .utils import get_padding, init_weights
-
-is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7')
-
-
-def stft(x, fft_size, hop_size, win_length, window):
-    """Perform STFT and convert to magnitude spectrogram.
-
-    Args:
-        x (Tensor): Input signal tensor (B, T).
-        fft_size (int): FFT size.
-        hop_size (int): Hop size.
-        win_length (int): Window length.
-        window (str): Window function type.
-
-    Returns:
-        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
-
-    """
-    if is_pytorch_17plus:
-        x_stft = torch.stft(
-            x, fft_size, hop_size, win_length, window, return_complex=False)
-    else:
-        x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
-    real = x_stft[..., 0]
-    imag = x_stft[..., 1]
-
-    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
-    return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
-
-
-LRELU_SLOPE = 0.1
-
-
-def get_padding_casual(kernel_size, dilation=1):
-    return int(kernel_size * dilation - dilation)
-
-
-class Conv1dCasual(torch.nn.Module):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 padding_mode='zeros'):
-        super(Conv1dCasual, self).__init__()
-        self.pad = padding
-        self.conv1d = weight_norm(
-            Conv1d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=0,
-                dilation=dilation,
-                groups=groups,
-                bias=bias,
-                padding_mode=padding_mode))
-        self.conv1d.apply(init_weights)
-
-    def forward(self, x):  # bdt
-        # described starting from the last dimension and moving forward.
-        x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
-        x = self.conv1d(x)
-        return x
-
-    def remove_weight_norm(self):
-        remove_weight_norm(self.conv1d)
-
-
-class ConvTranspose1dCausal(torch.nn.Module):
-    """CausalConvTranspose1d module with customized initialization."""
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride,
-                 padding=0):
-        """Initialize CausalConvTranspose1d module."""
-        super(ConvTranspose1dCausal, self).__init__()
-        self.deconv = weight_norm(
-            ConvTranspose1d(in_channels, out_channels, kernel_size, stride))
-        self.stride = stride
-        self.deconv.apply(init_weights)
-        self.pad = kernel_size - stride
-
-    def forward(self, x):
-        """Calculate forward propagation.
-        Args:
-            x (Tensor): Input tensor (B, in_channels, T_in).
-        Returns:
-            Tensor: Output tensor (B, out_channels, T_out).
-        """
-        # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant")
-        return self.deconv(x)[:, :, :-self.pad]
-
-    def remove_weight_norm(self):
-        remove_weight_norm(self.deconv)
-
-
-class ResBlock1(torch.nn.Module):
-
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(ResBlock1, self).__init__()
-        self.h = h
-        self.convs1 = nn.ModuleList([
-            Conv1dCasual(
-                channels,
-                channels,
-                kernel_size,
-                1,
-                dilation=dilation[i],
-                padding=get_padding_casual(kernel_size, dilation[i]))
-            for i in range(len(dilation))
-        ])
-
-        self.convs2 = nn.ModuleList([
-            Conv1dCasual(
-                channels,
-                channels,
-                kernel_size,
-                1,
-                dilation=1,
-                padding=get_padding_casual(kernel_size, 1))
-            for i in range(len(dilation))
-        ])
-
-    def forward(self, x):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            xt = c2(xt)
-            x = xt + x
-        return x
-
-    def remove_weight_norm(self):
-        for layer in self.convs1:
-            layer.remove_weight_norm()
-        for layer in self.convs2:
-            layer.remove_weight_norm()
-
-
-class Generator(torch.nn.Module):
-
-    def __init__(self, h):
-        super(Generator, self).__init__()
-        self.h = h
-        self.num_kernels = len(h.resblock_kernel_sizes)
-        self.num_upsamples = len(h.upsample_rates)
-        print('num_kernels={}, num_upsamples={}'.format(
-            self.num_kernels, self.num_upsamples))
-        self.conv_pre = Conv1dCasual(
-            80, h.upsample_initial_channel, 7, 1, padding=7 - 1)
-        resblock = ResBlock1 if h.resblock == '1' else ResBlock2
-
-        self.ups = nn.ModuleList()
-        self.repeat_ups = nn.ModuleList()
-        for i, (u, k) in enumerate(
-                zip(h.upsample_rates, h.upsample_kernel_sizes)):
-            upsample = nn.Sequential(
-                nn.Upsample(mode='nearest', scale_factor=u),
-                nn.LeakyReLU(LRELU_SLOPE),
-                Conv1dCasual(
-                    h.upsample_initial_channel // (2**i),
-                    h.upsample_initial_channel // (2**(i + 1)),
-                    kernel_size=7,
-                    stride=1,
-                    padding=7 - 1))
-            self.repeat_ups.append(upsample)
-            self.ups.append(
-                ConvTranspose1dCausal(
-                    h.upsample_initial_channel // (2**i),
-                    h.upsample_initial_channel // (2**(i + 1)),
-                    k,
-                    u,
-                    padding=(k - u) // 2))
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = h.upsample_initial_channel // (2**(i + 1))
-            for j, (k, d) in enumerate(
-                    zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
-                self.resblocks.append(resblock(h, ch, k, d))
-
-        self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1)
-
-    def forward(self, x):
-        x = self.conv_pre(x)
-        for i in range(self.num_upsamples):
-            x = torch.sin(x) + x
-            # transconv
-            x1 = F.leaky_relu(x, LRELU_SLOPE)
-            x1 = self.ups[i](x1)
-            # repeat
-            x2 = self.repeat_ups[i](x)
-            x = x1 + x2
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-        return x
-
-    def remove_weight_norm(self):
-        print('Removing weight norm...')
-        for layer in self.ups:
-            layer.remove_weight_norm()
-        for layer in self.repeat_ups:
-            layer[-1].remove_weight_norm()
-        for layer in self.resblocks:
-            layer.remove_weight_norm()
-        self.conv_pre.remove_weight_norm()
-        self.conv_post.remove_weight_norm()
-
-
-class DiscriminatorP(torch.nn.Module):
-
-    def __init__(self,
-                 period,
-                 kernel_size=5,
-                 stride=3,
-                 use_spectral_norm=False):
-        super(DiscriminatorP, self).__init__()
-        self.period = period
-        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
-        self.convs = nn.ModuleList([
-            norm_f(
-                Conv2d(
-                    1,
-                    32, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(5, 1), 0))),
-            norm_f(
-                Conv2d(
-                    32,
-                    128, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(5, 1), 0))),
-            norm_f(
-                Conv2d(
-                    128,
-                    512, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(5, 1), 0))),
-            norm_f(
-                Conv2d(
-                    512,
-                    1024, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(5, 1), 0))),
-            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
-        ])
-        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-    def forward(self, x):
-        fmap = []
-
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # pad first
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), 'reflect')
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-
-        for layer in self.convs:
-            x = layer(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class MultiPeriodDiscriminator(torch.nn.Module):
-
-    def __init__(self):
-        super(MultiPeriodDiscriminator, self).__init__()
-        self.discriminators = nn.ModuleList([
-            DiscriminatorP(2),
-            DiscriminatorP(3),
-            DiscriminatorP(5),
-            DiscriminatorP(7),
-            DiscriminatorP(11),
-        ])
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            fmap_rs.append(fmap_r)
-            y_d_gs.append(y_d_g)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-class DiscriminatorS(torch.nn.Module):
-
-    def __init__(self, use_spectral_norm=False):
-        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
-        self.convs = nn.ModuleList([
-            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
-            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
-            norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
-            norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
-            norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
-            norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
-            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-        ])
-        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-    def forward(self, x):
-        fmap = []
-        for layer in self.convs:
-            x = layer(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class MultiScaleDiscriminator(torch.nn.Module):
-
-    def __init__(self):
-        super(MultiScaleDiscriminator, self).__init__()
-        self.discriminators = nn.ModuleList([
-            DiscriminatorS(use_spectral_norm=True),
-            DiscriminatorS(),
-            DiscriminatorS(),
-        ])
-        from pytorch_wavelets import DWT1DForward
-        self.meanpools = nn.ModuleList(
-            [DWT1DForward(wave='db3', J=1),
-             DWT1DForward(wave='db3', J=1)])
-        self.convs = nn.ModuleList([
-            weight_norm(Conv1d(2, 1, 15, 1, padding=7)),
-            weight_norm(Conv1d(2, 1, 15, 1, padding=7))
-        ])
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            if i != 0:
-                yl, yh = self.meanpools[i - 1](y)
-                y = torch.cat([yl, yh[0]], dim=1)
-                y = self.convs[i - 1](y)
-                y = F.leaky_relu(y, LRELU_SLOPE)
-
-                yl_hat, yh_hat = self.meanpools[i - 1](y_hat)
-                y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1)
-                y_hat = self.convs[i - 1](y_hat)
-                y_hat = F.leaky_relu(y_hat, LRELU_SLOPE)
-
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            fmap_rs.append(fmap_r)
-            y_d_gs.append(y_d_g)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-class DiscriminatorSTFT(torch.nn.Module):
-
-    def __init__(self,
-                 kernel_size=11,
-                 stride=2,
-                 use_spectral_norm=False,
-                 fft_size=1024,
-                 shift_size=120,
-                 win_length=600,
-                 window='hann_window'):
-        super(DiscriminatorSTFT, self).__init__()
-        self.fft_size = fft_size
-        self.shift_size = shift_size
-        self.win_length = win_length
-        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
-        self.convs = nn.ModuleList([
-            norm_f(
-                Conv2d(
-                    fft_size // 2 + 1,
-                    32, (15, 1), (1, 1),
-                    padding=(get_padding(15, 1), 0))),
-            norm_f(
-                Conv2d(
-                    32,
-                    32, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(9, 1), 0))),
-            norm_f(
-                Conv2d(
-                    32,
-                    32, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(9, 1), 0))),
-            norm_f(
-                Conv2d(
-                    32,
-                    32, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(9, 1), 0))),
-            norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))),
-        ])
-        self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0)))
-        self.register_buffer('window', getattr(torch, window)(win_length))
-
-    def forward(self, wav):
-        wav = torch.squeeze(wav, 1)
-        x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length,
-                     self.window)
-        x = torch.transpose(x_mag, 2, 1).unsqueeze(-1)
-        fmap = []
-        for layer in self.convs:
-            x = layer(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = x.squeeze(-1)
-
-        return x, fmap
-
-
-class MultiSTFTDiscriminator(torch.nn.Module):
-
-    def __init__(
-        self,
-        fft_sizes=[1024, 2048, 512],
-        hop_sizes=[120, 240, 50],
-        win_lengths=[600, 1200, 240],
-        window='hann_window',
-    ):
-        super(MultiSTFTDiscriminator, self).__init__()
-        self.discriminators = nn.ModuleList()
-        for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
-            self.discriminators += [
-                DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl)
-            ]
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            fmap_rs.append(fmap_r)
-            y_d_gs.append(y_d_g)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-def feature_loss(fmap_r, fmap_g):
-    loss = 0
-    for dr, dg in zip(fmap_r, fmap_g):
-        for rl, gl in zip(dr, dg):
-            loss += torch.mean(torch.abs(rl - gl))
-
-    return loss * 2
-
-
-def discriminator_loss(disc_real_outputs, disc_generated_outputs):
-    loss = 0
-    r_losses = []
-    g_losses = []
-    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
-        r_loss = torch.mean((1 - dr)**2)
-        g_loss = torch.mean(dg**2)
-        loss += (r_loss + g_loss)
-        r_losses.append(r_loss.item())
-        g_losses.append(g_loss.item())
-
-    return loss, r_losses, g_losses
-
-
-def generator_loss(disc_outputs):
-    loss = 0
-    gen_losses = []
-    for dg in disc_outputs:
-        temp_loss = torch.mean((1 - dg)**2)
-        gen_losses.append(temp_loss)
-        loss += temp_loss
-
-    return loss, gen_losses
diff --git a/modelscope/models/audio/tts/sambert_hifi.py b/modelscope/models/audio/tts/sambert_hifi.py
index 79f8068e..a9b55795 100644
--- a/modelscope/models/audio/tts/sambert_hifi.py
+++ b/modelscope/models/audio/tts/sambert_hifi.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 import os
@@ -11,13 +13,11 @@ from modelscope.models.base import Model
 from modelscope.models.builder import MODELS
 from modelscope.utils.audio.tts_exceptions import (
     TtsFrontendInitializeFailedException,
-    TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion,
+    TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationException,
     TtsVoiceNotExistsException)
 from modelscope.utils.constant import Tasks
 from .voice import Voice
 
-import tensorflow as tf  # isort:skip
-
 __all__ = ['SambertHifigan']
 
 
@@ -28,14 +28,15 @@ class SambertHifigan(Model):
     def __init__(self, model_dir, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
         if 'am' not in kwargs:
-            raise TtsModelConfigurationExcetion(
-                'configuration model field missing am!')
+            raise TtsModelConfigurationException(
+                'modelscope error: configuration model field missing am!')
         if 'vocoder' not in kwargs:
-            raise TtsModelConfigurationExcetion(
-                'configuration model field missing vocoder!')
+            raise TtsModelConfigurationException(
+                'modelscope error: configuration model field missing vocoder!')
         if 'lang_type' not in kwargs:
-            raise TtsModelConfigurationExcetion(
-                'configuration model field missing lang_type!')
+            raise TtsModelConfigurationException(
+                'modelscope error: configuration model field missing lang_type!'
+            )
         am_cfg = kwargs['am']
         voc_cfg = kwargs['vocoder']
         # initialize frontend
@@ -47,10 +48,12 @@ class SambertHifigan(Model):
             zip_ref.extractall(model_dir)
         if not frontend.initialize(self.__res_path):
             raise TtsFrontendInitializeFailedException(
-                'resource invalid: {}'.format(self.__res_path))
+                'modelscope error: resource invalid: {}'.format(
+                    self.__res_path))
         if not frontend.set_lang_type(kwargs['lang_type']):
             raise TtsFrontendLanguageTypeInvalidException(
-                'language type invalid: {}'.format(kwargs['lang_type']))
+                'modelscope error: language type invalid: {}'.format(
+                    kwargs['lang_type']))
         self.__frontend = frontend
         zip_file = os.path.join(model_dir, 'voices.zip')
         self.__voice_path = os.path.join(model_dir, 'voices')
@@ -60,7 +63,8 @@ class SambertHifigan(Model):
         with open(voice_cfg_path, 'r') as f:
             voice_cfg = json.load(f)
         if 'voices' not in voice_cfg:
-            raise TtsModelConfigurationExcetion('voices invalid')
+            raise TtsModelConfigurationException(
+                'modelscope error: voices invalid')
         self.__voice = {}
         for name in voice_cfg['voices']:
             voice_path = os.path.join(self.__voice_path, name)
@@ -70,11 +74,13 @@ class SambertHifigan(Model):
         if voice_cfg['voices']:
             self.__default_voice_name = voice_cfg['voices'][0]
         else:
-            raise TtsVoiceNotExistsException('voices is empty in voices.json')
+            raise TtsVoiceNotExistsException(
+                'modelscope error: voices is empty in voices.json')
 
     def __synthesis_one_sentences(self, voice_name, text):
         if voice_name not in self.__voice:
-            raise TtsVoiceNotExistsException(f'Voice {voice_name} not exists')
+            raise TtsVoiceNotExistsException(
+                f'modelscope error: Voice {voice_name} not exists')
         return self.__voice[voice_name].forward(text)
 
     def forward(self, text: str, voice_name: str = None):
diff --git a/modelscope/models/audio/tts/text/cleaners.py b/modelscope/models/audio/tts/text/cleaners.py
deleted file mode 100755
index 19d838d1..00000000
--- a/modelscope/models/audio/tts/text/cleaners.py
+++ /dev/null
@@ -1,89 +0,0 @@
-'''
-Cleaners are transformations that run over the input text at both training and eval time.
-
-Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
-hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-  1. "english_cleaners" for English text
-  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
-     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
-     the symbols in symbols.py to match your data).
-'''
-
-import re
-
-from unidecode import unidecode
-
-from .numbers import normalize_numbers
-
-# Regular expression matching whitespace:
-_whitespace_re = re.compile(r'\s+')
-
-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
-                  for x in [
-                      ('mrs', 'misess'),
-                      ('mr', 'mister'),
-                      ('dr', 'doctor'),
-                      ('st', 'saint'),
-                      ('co', 'company'),
-                      ('jr', 'junior'),
-                      ('maj', 'major'),
-                      ('gen', 'general'),
-                      ('drs', 'doctors'),
-                      ('rev', 'reverend'),
-                      ('lt', 'lieutenant'),
-                      ('hon', 'honorable'),
-                      ('sgt', 'sergeant'),
-                      ('capt', 'captain'),
-                      ('esq', 'esquire'),
-                      ('ltd', 'limited'),
-                      ('col', 'colonel'),
-                      ('ft', 'fort'), ]]  # yapf:disable
-
-
-def expand_abbreviations(text):
-    for regex, replacement in _abbreviations:
-        text = re.sub(regex, replacement, text)
-    return text
-
-
-def expand_numbers(text):
-    return normalize_numbers(text)
-
-
-def lowercase(text):
-    return text.lower()
-
-
-def collapse_whitespace(text):
-    return re.sub(_whitespace_re, ' ', text)
-
-
-def convert_to_ascii(text):
-    return unidecode(text)
-
-
-def basic_cleaners(text):
-    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
-
-
-def transliteration_cleaners(text):
-    '''Pipeline for non-English text that transliterates to ASCII.'''
-    text = convert_to_ascii(text)
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
-
-
-def english_cleaners(text):
-    '''Pipeline for English text, including number and abbreviation expansion.'''
-    text = convert_to_ascii(text)
-    text = lowercase(text)
-    text = expand_numbers(text)
-    text = expand_abbreviations(text)
-    text = collapse_whitespace(text)
-    return text
diff --git a/modelscope/models/audio/tts/text/cmudict.py b/modelscope/models/audio/tts/text/cmudict.py
deleted file mode 100755
index b4da4be9..00000000
--- a/modelscope/models/audio/tts/text/cmudict.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import re
-
-valid_symbols = [
-    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
-    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
-    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
-    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
-    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
-    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
-    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
-    'Y', 'Z', 'ZH'
-]
-
-_valid_symbol_set = set(valid_symbols)
-
-
-class CMUDict:
-    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
-
-    def __init__(self, file_or_path, keep_ambiguous=True):
-        if isinstance(file_or_path, str):
-            with open(file_or_path, encoding='latin-1') as f:
-                entries = _parse_cmudict(f)
-        else:
-            entries = _parse_cmudict(file_or_path)
-        if not keep_ambiguous:
-            entries = {
-                word: pron
-                for word, pron in entries.items() if len(pron) == 1
-            }
-        self._entries = entries
-
-    def __len__(self):
-        return len(self._entries)
-
-    def lookup(self, word):
-        '''Returns list of ARPAbet pronunciations of the given word.'''
-        return self._entries.get(word.upper())
-
-
-_alt_re = re.compile(r'\([0-9]+\)')
-
-
-def _parse_cmudict(file):
-    cmudict = {}
-    for line in file:
-        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
-            parts = line.split('  ')
-            word = re.sub(_alt_re, '', parts[0])
-            pronunciation = _get_pronunciation(parts[1])
-            if pronunciation:
-                if word in cmudict:
-                    cmudict[word].append(pronunciation)
-                else:
-                    cmudict[word] = [pronunciation]
-    return cmudict
-
-
-def _get_pronunciation(s):
-    parts = s.strip().split(' ')
-    for part in parts:
-        if part not in _valid_symbol_set:
-            return None
-    return ' '.join(parts)
diff --git a/modelscope/models/audio/tts/text/symbols.py b/modelscope/models/audio/tts/text/symbols.py
deleted file mode 100644
index 63975abb..00000000
--- a/modelscope/models/audio/tts/text/symbols.py
+++ /dev/null
@@ -1,105 +0,0 @@
-'''
-Defines the set of symbols used in text input to the model.
-
-The default is a set of ASCII characters that works well for English or text that has been run
-through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
-'''
-import codecs
-import os
-
-_pad = '_'
-_eos = '~'
-_mask = '@[MASK]'
-
-
-def load_symbols(dict_path, has_mask=True):
-    _characters = ''
-    _ch_symbols = []
-    sy_dict_name = 'sy_dict.txt'
-    sy_dict_path = os.path.join(dict_path, sy_dict_name)
-    f = codecs.open(sy_dict_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_symbols.append(line)
-
-    _arpabet = ['@' + s for s in _ch_symbols]
-
-    # Export all symbols:
-    sy = list(_characters) + _arpabet + [_pad, _eos]
-    if has_mask:
-        sy.append(_mask)
-
-    _characters = ''
-
-    _ch_tones = []
-    tone_dict_name = 'tone_dict.txt'
-    tone_dict_path = os.path.join(dict_path, tone_dict_name)
-    f = codecs.open(tone_dict_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_tones.append(line)
-
-    # Export all tones:
-    tone = list(_characters) + _ch_tones + [_pad, _eos]
-    if has_mask:
-        tone.append(_mask)
-
-    _characters = ''
-
-    _ch_syllable_flags = []
-    syllable_flag_name = 'syllable_flag_dict.txt'
-    syllable_flag_path = os.path.join(dict_path, syllable_flag_name)
-    f = codecs.open(syllable_flag_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_syllable_flags.append(line)
-
-    # Export all syllable_flags:
-    syllable_flag = list(_characters) + _ch_syllable_flags + [_pad, _eos]
-    if has_mask:
-        syllable_flag.append(_mask)
-
-    _characters = ''
-
-    _ch_word_segments = []
-    word_segment_name = 'word_segment_dict.txt'
-    word_segment_path = os.path.join(dict_path, word_segment_name)
-    f = codecs.open(word_segment_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_word_segments.append(line)
-
-    # Export all syllable_flags:
-    word_segment = list(_characters) + _ch_word_segments + [_pad, _eos]
-    if has_mask:
-        word_segment.append(_mask)
-
-    _characters = ''
-
-    _ch_emo_types = []
-    emo_category_name = 'emo_category_dict.txt'
-    emo_category_path = os.path.join(dict_path, emo_category_name)
-    f = codecs.open(emo_category_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_emo_types.append(line)
-
-    emo_category = list(_characters) + _ch_emo_types + [_pad, _eos]
-    if has_mask:
-        emo_category.append(_mask)
-
-    _characters = ''
-
-    _ch_speakers = []
-    speaker_name = 'speaker_dict.txt'
-    speaker_path = os.path.join(dict_path, speaker_name)
-    f = codecs.open(speaker_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_speakers.append(line)
-
-    # Export all syllable_flags:
-    speaker = list(_characters) + _ch_speakers + [_pad, _eos]
-    if has_mask:
-        speaker.append(_mask)
-    return sy, tone, syllable_flag, word_segment, emo_category, speaker
diff --git a/modelscope/models/audio/tts/text/symbols_dict.py b/modelscope/models/audio/tts/text/symbols_dict.py
deleted file mode 100644
index e8f7ed19..00000000
--- a/modelscope/models/audio/tts/text/symbols_dict.py
+++ /dev/null
@@ -1,200 +0,0 @@
-import re
-import sys
-
-from .cleaners import (basic_cleaners, english_cleaners,
-                       transliteration_cleaners)
-
-
-class SymbolsDict:
-
-    def __init__(self, sy, tone, syllable_flag, word_segment, emo_category,
-                 speaker, inputs_dim, lfeat_type_list):
-        self._inputs_dim = inputs_dim
-        self._lfeat_type_list = lfeat_type_list
-        self._sy_to_id = {s: i for i, s in enumerate(sy)}
-        self._id_to_sy = {i: s for i, s in enumerate(sy)}
-        self._tone_to_id = {s: i for i, s in enumerate(tone)}
-        self._id_to_tone = {i: s for i, s in enumerate(tone)}
-        self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)}
-        self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)}
-        self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)}
-        self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)}
-        self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)}
-        self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)}
-        self._speaker_to_id = {s: i for i, s in enumerate(speaker)}
-        self._id_to_speaker = {i: s for i, s in enumerate(speaker)}
-        print('_sy_to_id: ')
-        print(self._sy_to_id)
-        print('_tone_to_id: ')
-        print(self._tone_to_id)
-        print('_syllable_flag_to_id: ')
-        print(self._syllable_flag_to_id)
-        print('_word_segment_to_id: ')
-        print(self._word_segment_to_id)
-        print('_emo_category_to_id: ')
-        print(self._emo_category_to_id)
-        print('_speaker_to_id: ')
-        print(self._speaker_to_id)
-        self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
-        self._cleaners = {
-            basic_cleaners.__name__: basic_cleaners,
-            transliteration_cleaners.__name__: transliteration_cleaners,
-            english_cleaners.__name__: english_cleaners
-        }
-
-    def _clean_text(self, text, cleaner_names):
-        for name in cleaner_names:
-            cleaner = self._cleaners.get(name)
-            if not cleaner:
-                raise Exception('Unknown cleaner: %s' % name)
-            text = cleaner(text)
-        return text
-
-    def _sy_to_sequence(self, sy):
-        return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)]
-
-    def _arpabet_to_sequence(self, text):
-        return self._sy_to_sequence(['@' + s for s in text.split()])
-
-    def _should_keep_sy(self, s):
-        return s in self._sy_to_id and s != '_' and s != '~'
-
-    def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names):
-        sequence = []
-        if lfeat_type == 'sy':
-            this_lfeat_symbol = this_lfeat_symbol.strip().split(' ')
-            this_lfeat_symbol_format = ''
-            index = 0
-            while index < len(this_lfeat_symbol):
-                this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[
-                    index] + '}' + ' '
-                index = index + 1
-            sequence = self.text_to_sequence(this_lfeat_symbol_format,
-                                             cleaner_names)
-        elif lfeat_type == 'tone':
-            sequence = self.tone_to_sequence(this_lfeat_symbol)
-        elif lfeat_type == 'syllable_flag':
-            sequence = self.syllable_flag_to_sequence(this_lfeat_symbol)
-        elif lfeat_type == 'word_segment':
-            sequence = self.word_segment_to_sequence(this_lfeat_symbol)
-        elif lfeat_type == 'emo_category':
-            sequence = self.emo_category_to_sequence(this_lfeat_symbol)
-        elif lfeat_type == 'speaker':
-            sequence = self.speaker_to_sequence(this_lfeat_symbol)
-        else:
-            raise Exception('Unknown lfeat type: %s' % lfeat_type)
-
-        return sequence
-
-    def text_to_sequence(self, text, cleaner_names):
-        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-
-          The text can optionally have ARPAbet sequences enclosed in curly braces embedded
-          in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
-
-          Args:
-            text: string to convert to a sequence
-            cleaner_names: names of the cleaner functions to run the text through
-
-          Returns:
-            List of integers corresponding to the symbols in the text
-        '''
-        sequence = []
-
-        # Check for curly braces and treat their contents as ARPAbet:
-        while len(text):
-            m = self._curly_re.match(text)
-            if not m:
-                sequence += self._sy_to_sequence(
-                    self._clean_text(text, cleaner_names))
-                break
-            sequence += self._sy_to_sequence(
-                self._clean_text(m.group(1), cleaner_names))
-            sequence += self._arpabet_to_sequence(m.group(2))
-            text = m.group(3)
-
-        # Append EOS token
-        sequence.append(self._sy_to_id['~'])
-        return sequence
-
-    def tone_to_sequence(self, tone):
-        tones = tone.strip().split(' ')
-        sequence = []
-        for this_tone in tones:
-            sequence.append(self._tone_to_id[this_tone])
-        sequence.append(self._tone_to_id['~'])
-        return sequence
-
-    def syllable_flag_to_sequence(self, syllable_flag):
-        syllable_flags = syllable_flag.strip().split(' ')
-        sequence = []
-        for this_syllable_flag in syllable_flags:
-            sequence.append(self._syllable_flag_to_id[this_syllable_flag])
-        sequence.append(self._syllable_flag_to_id['~'])
-        return sequence
-
-    def word_segment_to_sequence(self, word_segment):
-        word_segments = word_segment.strip().split(' ')
-        sequence = []
-        for this_word_segment in word_segments:
-            sequence.append(self._word_segment_to_id[this_word_segment])
-        sequence.append(self._word_segment_to_id['~'])
-        return sequence
-
-    def emo_category_to_sequence(self, emo_type):
-        emo_categories = emo_type.strip().split(' ')
-        sequence = []
-        for this_category in emo_categories:
-            sequence.append(self._emo_category_to_id[this_category])
-        sequence.append(self._emo_category_to_id['~'])
-        return sequence
-
-    def speaker_to_sequence(self, speaker):
-        speakers = speaker.strip().split(' ')
-        sequence = []
-        for this_speaker in speakers:
-            sequence.append(self._speaker_to_id[this_speaker])
-        sequence.append(self._speaker_to_id['~'])
-        return sequence
-
-    def sequence_to_symbol(self, sequence):
-        result = ''
-        pre_lfeat_dim = 0
-        for lfeat_type in self._lfeat_type_list:
-            current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim
-                                                + self._inputs_dim[lfeat_type]]
-            current_sequence = current_one_hot_sequence.argmax(1)
-            length = current_sequence.shape[0]
-
-            index = 0
-            while index < length:
-                this_sequence = current_sequence[index]
-                s = ''
-                if lfeat_type == 'sy':
-                    s = self._id_to_sy[this_sequence]
-                    if len(s) > 1 and s[0] == '@':
-                        s = s[1:]
-                elif lfeat_type == 'tone':
-                    s = self._id_to_tone[this_sequence]
-                elif lfeat_type == 'syllable_flag':
-                    s = self._id_to_syllable_flag[this_sequence]
-                elif lfeat_type == 'word_segment':
-                    s = self._id_to_word_segment[this_sequence]
-                elif lfeat_type == 'emo_category':
-                    s = self._id_to_emo_category[this_sequence]
-                elif lfeat_type == 'speaker':
-                    s = self._id_to_speaker[this_sequence]
-                else:
-                    raise Exception('Unknown lfeat type: %s' % lfeat_type)
-
-                if index == 0:
-                    result = result + lfeat_type + ': '
-
-                result = result + '{' + s + '}'
-
-                if index == length - 1:
-                    result = result + '; '
-
-                index = index + 1
-            pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type]
-        return result
diff --git a/modelscope/models/audio/tts/voice.py b/modelscope/models/audio/tts/voice.py
index deaebf11..dc830db5 100644
--- a/modelscope/models/audio/tts/voice.py
+++ b/modelscope/models/audio/tts/voice.py
@@ -1,286 +1,111 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
+import pickle as pkl
 
 import json
 import numpy as np
 import torch
-from sklearn.preprocessing import MultiLabelBinarizer
 
+from modelscope.utils.audio.tts_exceptions import (
+    TtsModelConfigurationException, TtsVocoderMelspecShapeMismatchException)
 from modelscope.utils.constant import ModelFile, Tasks
-from .models import Generator, create_am_model
-from .text.symbols import load_symbols
-from .text.symbols_dict import SymbolsDict
-
-import tensorflow as tf  # isort:skip
+from .models.datasets.units import KanTtsLinguisticUnit
+from .models.models.hifigan import Generator
+from .models.models.sambert import KanTtsSAMBERT
+from .models.utils import (AttrDict, build_env, init_weights, load_checkpoint,
+                           plot_spectrogram, save_checkpoint, scan_checkpoint)
 
 MAX_WAV_VALUE = 32768.0
 
 
-def multi_label_symbol_to_sequence(my_classes, my_symbol):
-    one_hot = MultiLabelBinarizer(classes=my_classes)
-    tokens = my_symbol.strip().split(' ')
-    sequences = []
-    for token in tokens:
-        sequences.append(tuple(token.split('&')))
-    return one_hot.fit_transform(sequences)
-
-
-def load_checkpoint(filepath, device):
-    assert os.path.isfile(filepath)
-    checkpoint_dict = torch.load(filepath, map_location=device)
-    return checkpoint_dict
-
-
-class AttrDict(dict):
-
-    def __init__(self, *args, **kwargs):
-        super(AttrDict, self).__init__(*args, **kwargs)
-        self.__dict__ = self
-
-
 class Voice:
 
-    def __init__(self, voice_name, voice_path, am_hparams, voc_config):
+    def __init__(self, voice_name, voice_path, am_config, voc_config):
         self.__voice_name = voice_name
         self.__voice_path = voice_path
-        self.__am_hparams = tf.contrib.training.HParams(**am_hparams)
+        self.__am_config = AttrDict(**am_config)
         self.__voc_config = AttrDict(**voc_config)
         self.__model_loaded = False
+        if 'am' not in self.__am_config:
+            raise TtsModelConfigurationException(
+                'modelscope error: am configuration invalid')
+        if 'linguistic_unit' not in self.__am_config:
+            raise TtsModelConfigurationException(
+                'modelscope error: am configuration invalid')
+        self.__am_lingustic_unit_config = self.__am_config['linguistic_unit']
 
     def __load_am(self):
-        local_am_ckpt_path = os.path.join(self.__voice_path,
-                                          ModelFile.TF_CHECKPOINT_FOLDER)
-        self.__am_ckpt_path = os.path.join(local_am_ckpt_path, 'ckpt')
-        self.__dict_path = os.path.join(self.__voice_path, 'dicts')
+        local_am_ckpt_path = os.path.join(self.__voice_path, 'am')
+        self.__am_ckpt_path = os.path.join(local_am_ckpt_path,
+                                           ModelFile.TORCH_MODEL_BIN_FILE)
         has_mask = True
-        if self.__am_hparams.get('has_mask') is not None:
-            has_mask = self.__am_hparams.has_mask
-        model_name = 'robutrans'
-        self.__lfeat_type_list = self.__am_hparams.lfeat_type_list.strip(
-        ).split(',')
-        sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols(
-            self.__dict_path, has_mask)
-        self.__sy = sy
-        self.__tone = tone
-        self.__syllable_flag = syllable_flag
-        self.__word_segment = word_segment
-        self.__emo_category = emo_category
-        self.__speaker = speaker
-        self.__inputs_dim = dict()
-        for lfeat_type in self.__lfeat_type_list:
-            if lfeat_type == 'sy':
-                self.__inputs_dim[lfeat_type] = len(sy)
-            elif lfeat_type == 'tone':
-                self.__inputs_dim[lfeat_type] = len(tone)
-            elif lfeat_type == 'syllable_flag':
-                self.__inputs_dim[lfeat_type] = len(syllable_flag)
-            elif lfeat_type == 'word_segment':
-                self.__inputs_dim[lfeat_type] = len(word_segment)
-            elif lfeat_type == 'emo_category':
-                self.__inputs_dim[lfeat_type] = len(emo_category)
-            elif lfeat_type == 'speaker':
-                self.__inputs_dim[lfeat_type] = len(speaker)
-
-        self.__symbols_dict = SymbolsDict(sy, tone, syllable_flag,
-                                          word_segment, emo_category, speaker,
-                                          self.__inputs_dim,
-                                          self.__lfeat_type_list)
-        dim_inputs = sum(self.__inputs_dim.values(
-        )) - self.__inputs_dim['speaker'] - self.__inputs_dim['emo_category']
-        self.__graph = tf.Graph()
-        with self.__graph.as_default():
-            inputs = tf.placeholder(tf.float32, [1, None, dim_inputs],
-                                    'inputs')
-            inputs_emotion = tf.placeholder(
-                tf.float32, [1, None, self.__inputs_dim['emo_category']],
-                'inputs_emotion')
-            inputs_speaker = tf.placeholder(
-                tf.float32, [1, None, self.__inputs_dim['speaker']],
-                'inputs_speaker')
-            input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
-            pitch_contours_scale = tf.placeholder(tf.float32, [1, None],
-                                                  'pitch_contours_scale')
-            energy_contours_scale = tf.placeholder(tf.float32, [1, None],
-                                                   'energy_contours_scale')
-            duration_scale = tf.placeholder(tf.float32, [1, None],
-                                            'duration_scale')
-            with tf.variable_scope('model') as _:
-                self.__model = create_am_model(model_name, self.__am_hparams)
-                self.__model.initialize(
-                    inputs,
-                    inputs_emotion,
-                    inputs_speaker,
-                    input_lengths,
-                    duration_scales=duration_scale,
-                    pitch_scales=pitch_contours_scale,
-                    energy_scales=energy_contours_scale)
-                self.__mel_spec = self.__model.mel_outputs[0]
-                self.__duration_outputs = self.__model.duration_outputs[0]
-                self.__duration_outputs_ = self.__model.duration_outputs_[0]
-                self.__pitch_contour_outputs = self.__model.pitch_contour_outputs[
-                    0]
-                self.__energy_contour_outputs = self.__model.energy_contour_outputs[
-                    0]
-                self.__embedded_inputs_emotion = self.__model.embedded_inputs_emotion[
-                    0]
-                self.__embedding_fsmn_outputs = self.__model.embedding_fsmn_outputs[
-                    0]
-                self.__encoder_outputs = self.__model.encoder_outputs[0]
-                self.__pitch_embeddings = self.__model.pitch_embeddings[0]
-                self.__energy_embeddings = self.__model.energy_embeddings[0]
-                self.__LR_outputs = self.__model.LR_outputs[0]
-                self.__postnet_fsmn_outputs = self.__model.postnet_fsmn_outputs[
-                    0]
-                self.__attention_h = self.__model.attention_h
-                self.__attention_x = self.__model.attention_x
-
-                config = tf.ConfigProto()
-                config.gpu_options.allow_growth = True
-                self.__session = tf.Session(config=config)
-                self.__session.run(tf.global_variables_initializer())
-
-                saver = tf.train.Saver()
-                saver.restore(self.__session, self.__am_ckpt_path)
+        if 'has_mask' in self.__am_lingustic_unit_config:
+            has_mask = self.__am_lingustic_unit_config.has_mask
+        self.__ling_unit = KanTtsLinguisticUnit(
+            self.__am_lingustic_unit_config, self.__voice_path, has_mask)
+        self.__am_net = KanTtsSAMBERT(self.__am_config,
+                                      self.__ling_unit.get_unit_size()).to(
+                                          self.__device)
+        state_dict_g = {}
+        try:
+            state_dict_g = load_checkpoint(self.__am_ckpt_path, self.__device)
+        except RuntimeError:
+            with open(self.__am_ckpt_path, 'rb') as f:
+                pth_var_dict = pkl.load(f)
+                state_dict_g['fsnet'] = {
+                    k: torch.FloatTensor(v)
+                    for k, v in pth_var_dict['fsnet'].items()
+                }
+        self.__am_net.load_state_dict(state_dict_g['fsnet'], strict=False)
+        self.__am_net.eval()
 
     def __load_vocoder(self):
-        self.__voc_ckpt_path = os.path.join(self.__voice_path,
+        local_voc_ckpy_path = os.path.join(self.__voice_path, 'vocoder')
+        self.__voc_ckpt_path = os.path.join(local_voc_ckpy_path,
                                             ModelFile.TORCH_MODEL_BIN_FILE)
-        if torch.cuda.is_available():
-            torch.manual_seed(self.__voc_config.seed)
-            self.__device = torch.device('cuda')
-        else:
-            self.__device = torch.device('cpu')
         self.__generator = Generator(self.__voc_config).to(self.__device)
         state_dict_g = load_checkpoint(self.__voc_ckpt_path, self.__device)
         self.__generator.load_state_dict(state_dict_g['generator'])
         self.__generator.eval()
         self.__generator.remove_weight_norm()
 
-    def __am_forward(self,
-                     text,
-                     pitch_control_str='',
-                     duration_control_str='',
-                     energy_control_str=''):
-        duration_cfg_lst = []
-        if len(duration_control_str) != 0:
-            for item in duration_control_str.strip().split('|'):
-                percent, scale = item.lstrip('(').rstrip(')').split(',')
-                duration_cfg_lst.append((float(percent), float(scale)))
-        pitch_contours_cfg_lst = []
-        if len(pitch_control_str) != 0:
-            for item in pitch_control_str.strip().split('|'):
-                percent, scale = item.lstrip('(').rstrip(')').split(',')
-                pitch_contours_cfg_lst.append((float(percent), float(scale)))
-        energy_contours_cfg_lst = []
-        if len(energy_control_str) != 0:
-            for item in energy_control_str.strip().split('|'):
-                percent, scale = item.lstrip('(').rstrip(')').split(',')
-                energy_contours_cfg_lst.append((float(percent), float(scale)))
-        cleaner_names = [
-            x.strip() for x in self.__am_hparams.cleaners.split(',')
-        ]
-
-        lfeat_symbol = text.strip().split(' ')
-        lfeat_symbol_separate = [''] * int(len(self.__lfeat_type_list))
-        for this_lfeat_symbol in lfeat_symbol:
-            this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split(
-                '$')
-            if len(this_lfeat_symbol) != len(self.__lfeat_type_list):
-                raise Exception(
-                    'Length of this_lfeat_symbol in training data'
-                    + ' is not equal to the length of lfeat_type_list, '
-                    + str(len(this_lfeat_symbol)) + ' VS. '
-                    + str(len(self.__lfeat_type_list)))
-            index = 0
-            while index < len(lfeat_symbol_separate):
-                lfeat_symbol_separate[index] = lfeat_symbol_separate[
-                    index] + this_lfeat_symbol[index] + ' '
-                index = index + 1
-
-        index = 0
-        lfeat_type = self.__lfeat_type_list[index]
-        sequence = self.__symbols_dict.symbol_to_sequence(
-            lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names)
-        sequence_array = np.asarray(
-            sequence[:-1],
-            dtype=np.int32)  # sequence length minus 1 to ignore EOS ~
-        inputs = np.eye(
-            self.__inputs_dim[lfeat_type], dtype=np.float32)[sequence_array]
-        index = index + 1
-        while index < len(self.__lfeat_type_list) - 2:
-            lfeat_type = self.__lfeat_type_list[index]
-            sequence = self.__symbols_dict.symbol_to_sequence(
-                lfeat_symbol_separate[index].strip(), lfeat_type,
-                cleaner_names)
-            sequence_array = np.asarray(
-                sequence[:-1],
-                dtype=np.int32)  # sequence length minus 1 to ignore EOS ~
-            inputs_temp = np.eye(
-                self.__inputs_dim[lfeat_type],
-                dtype=np.float32)[sequence_array]
-            inputs = np.concatenate((inputs, inputs_temp), axis=1)
-            index = index + 1
-        seq = inputs
-
-        lfeat_type = 'emo_category'
-        inputs_emotion = multi_label_symbol_to_sequence(
-            self.__emo_category, lfeat_symbol_separate[index].strip())
-        # inputs_emotion = inputs_emotion * 1.5
-        index = index + 1
-
-        lfeat_type = 'speaker'
-        inputs_speaker = multi_label_symbol_to_sequence(
-            self.__speaker, lfeat_symbol_separate[index].strip())
-
-        duration_scale = np.ones((len(seq), ), dtype=np.float32)
-        start_idx = 0
-        for (percent, scale) in duration_cfg_lst:
-            duration_scale[start_idx:start_idx
-                           + int(percent * len(seq))] = scale
-            start_idx += int(percent * len(seq))
-
-        pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32)
-        start_idx = 0
-        for (percent, scale) in pitch_contours_cfg_lst:
-            pitch_contours_scale[start_idx:start_idx
-                                 + int(percent * len(seq))] = scale
-            start_idx += int(percent * len(seq))
-
-        energy_contours_scale = np.ones((len(seq), ), dtype=np.float32)
-        start_idx = 0
-        for (percent, scale) in energy_contours_cfg_lst:
-            energy_contours_scale[start_idx:start_idx
-                                  + int(percent * len(seq))] = scale
-            start_idx += int(percent * len(seq))
-
-        feed_dict = {
-            self.__model.inputs: [np.asarray(seq, dtype=np.float32)],
-            self.__model.inputs_emotion:
-            [np.asarray(inputs_emotion, dtype=np.float32)],
-            self.__model.inputs_speaker:
-            [np.asarray(inputs_speaker, dtype=np.float32)],
-            self.__model.input_lengths:
-            np.asarray([len(seq)], dtype=np.int32),
-            self.__model.duration_scales: [duration_scale],
-            self.__model.pitch_scales: [pitch_contours_scale],
-            self.__model.energy_scales: [energy_contours_scale]
-        }
-
-        result = self.__session.run([
-            self.__mel_spec, self.__duration_outputs, self.__duration_outputs_,
-            self.__pitch_contour_outputs, self.__embedded_inputs_emotion,
-            self.__embedding_fsmn_outputs, self.__encoder_outputs,
-            self.__pitch_embeddings, self.__LR_outputs,
-            self.__postnet_fsmn_outputs, self.__energy_contour_outputs,
-            self.__energy_embeddings, self.__attention_x, self.__attention_h
-        ], feed_dict=feed_dict)  # yapf:disable
-        return result[0]
+    def __am_forward(self, symbol_seq):
+        with torch.no_grad():
+            inputs_feat_lst = self.__ling_unit.encode_symbol_sequence(
+                symbol_seq)
+            inputs_sy = torch.from_numpy(inputs_feat_lst[0]).long().to(
+                self.__device)
+            inputs_tone = torch.from_numpy(inputs_feat_lst[1]).long().to(
+                self.__device)
+            inputs_syllable = torch.from_numpy(inputs_feat_lst[2]).long().to(
+                self.__device)
+            inputs_ws = torch.from_numpy(inputs_feat_lst[3]).long().to(
+                self.__device)
+            inputs_ling = torch.stack(
+                [inputs_sy, inputs_tone, inputs_syllable, inputs_ws],
+                dim=-1).unsqueeze(0)
+            inputs_emo = torch.from_numpy(inputs_feat_lst[4]).long().to(
+                self.__device).unsqueeze(0)
+            inputs_spk = torch.from_numpy(inputs_feat_lst[5]).long().to(
+                self.__device).unsqueeze(0)
+            inputs_len = torch.zeros(1).to(self.__device).long(
+            ) + inputs_emo.size(1) - 1  # minus 1 for "~"
+            res = self.__am_net(inputs_ling[:, :-1, :], inputs_emo[:, :-1],
+                                inputs_spk[:, :-1], inputs_len)
+            postnet_outputs = res['postnet_outputs']
+            LR_length_rounded = res['LR_length_rounded']
+            valid_length = int(LR_length_rounded[0].item())
+            postnet_outputs = postnet_outputs[
+                0, :valid_length, :].cpu().numpy()
+            return postnet_outputs
 
     def __vocoder_forward(self, melspec):
         dim0 = list(melspec.shape)[-1]
         if dim0 != self.__voc_config.num_mels:
             raise TtsVocoderMelspecShapeMismatchException(
-                'input melspec mismatch require {} but {}'.format(
-                    self.__voc_config.num_mels, dim0))
+                'modelscope error: input melspec mismatch require {} but {}'.
+                format(self.__voc_config.num_mels, dim0))
         with torch.no_grad():
             x = melspec.T
             x = torch.FloatTensor(x).to(self.__device)
@@ -292,9 +117,15 @@ class Voice:
             audio = audio.cpu().numpy().astype('int16')
             return audio
 
-    def forward(self, text):
+    def forward(self, symbol_seq):
         if not self.__model_loaded:
+            torch.manual_seed(self.__am_config.seed)
+            if torch.cuda.is_available():
+                torch.manual_seed(self.__am_config.seed)
+                self.__device = torch.device('cuda')
+            else:
+                self.__device = torch.device('cpu')
             self.__load_am()
             self.__load_vocoder()
             self.__model_loaded = True
-        return self.__vocoder_forward(self.__am_forward(text))
+        return self.__vocoder_forward(self.__am_forward(symbol_seq))
diff --git a/modelscope/pipelines/audio/text_to_speech_pipeline.py b/modelscope/pipelines/audio/text_to_speech_pipeline.py
index f9e7d80a..2063da68 100644
--- a/modelscope/pipelines/audio/text_to_speech_pipeline.py
+++ b/modelscope/pipelines/audio/text_to_speech_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, List
 
 import numpy as np
@@ -42,3 +44,6 @@ class TextToSpeechSambertHifiganPipeline(Pipeline):
 
     def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
         return inputs
+
+    def _sanitize_parameters(self, **pipeline_parameters):
+        return {}, pipeline_parameters, {}
diff --git a/modelscope/utils/audio/tts_exceptions.py b/modelscope/utils/audio/tts_exceptions.py
index 8c73b603..43ec994b 100644
--- a/modelscope/utils/audio/tts_exceptions.py
+++ b/modelscope/utils/audio/tts_exceptions.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 """
 Define TTS exceptions
 """
@@ -10,7 +11,7 @@ class TtsException(Exception):
     pass
 
 
-class TtsModelConfigurationExcetion(TtsException):
+class TtsModelConfigurationException(TtsException):
     """
     TTS model configuration exceptions.
     """
diff --git a/requirements/audio.txt b/requirements/audio.txt
index 5e4bc104..d22ad8f1 100644
--- a/requirements/audio.txt
+++ b/requirements/audio.txt
@@ -1,6 +1,5 @@
 easyasr>=0.0.2
 espnet>=202204
-#tts
 h5py
 inflect
 keras
@@ -15,11 +14,7 @@ nltk
 numpy<=1.18
 # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.
 protobuf>3,<3.21.0
-ptflops
 py_sound_connect
-pytorch_wavelets
-PyWavelets>=1.0.0
-scikit-learn
 SoundFile>0.10
 sox
 torchaudio
diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py
index e82cf43e..f659e59b 100644
--- a/tests/pipelines/test_text_to_speech.py
+++ b/tests/pipelines/test_text_to_speech.py
@@ -9,6 +9,7 @@ import unittest
 import torch
 from scipy.io.wavfile import write
 
+from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -33,7 +34,9 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase,
         text = '今天北京天气怎么样？'
         voice = 'zhitian_emo'
 
-        sambert_hifigan_tts = pipeline(task=self.task, model=self.model_id)
+        model = Model.from_pretrained(
+            model_name_or_path=self.model_id, revision='pytorch_am')
+        sambert_hifigan_tts = pipeline(task=self.task, model=model)
         self.assertTrue(sambert_hifigan_tts is not None)
         output = sambert_hifigan_tts(input=text, voice=voice)
         self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM])

From b98114367bb8f3e383cb101d329cc85481264ee3 Mon Sep 17 00:00:00 2001
From: "shuying.shu" <shuying.shu@alibaba-inc.com>
Date: Tue, 27 Sep 2022 22:15:24 +0800
Subject: [PATCH 149/175] [to #42322933]add timestamp for movie scene
 segmentation output         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10269467

    * add timestamp for movie scene segmentation output
---
 .../models/audio/tts/models/datasets/__init__.py     |  0
 .../cv/movie_scene_segmentation/utils/save_op.py     | 12 ++++++++----
 modelscope/outputs.py                                |  3 ++-
 3 files changed, 10 insertions(+), 5 deletions(-)
 mode change 100644 => 100755 modelscope/models/audio/tts/models/datasets/__init__.py

diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py
old mode 100644
new mode 100755
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
index 6361c056..b350ff13 100644
--- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
@@ -26,7 +26,8 @@ def pred2scene(shot2keyf, anno_dict):
     for scene_ind, scene_item in enumerate(scene_list):
         scene_dict_lst.append({
             'shot': pair_list[scene_ind],
-            'frame': scene_item
+            'frame': scene_item[0],
+            'timestamp': scene_item[1]
         })
 
     return scene_dict_lst, scene_list
@@ -42,8 +43,8 @@ def scene2video(source_movie_fn, scene_list, thres):
 
     for scene_ind, scene_item in tqdm(enumerate(scene_list)):
         scene = str(scene_ind).zfill(4)
-        start_frame = int(scene_item[0])
-        end_frame = int(scene_item[1])
+        start_frame = int(scene_item[0][0])
+        end_frame = int(scene_item[0][1])
         start_time, end_time = start_frame / fps, end_frame / fps
         duration_time = end_time - start_time
         out_video_fn = os.path.join(out_video_dir_fn,
@@ -71,7 +72,10 @@ def get_demo_scene_list(shot2keyf, anno_dict):
         start_shot, end_shot = int(pair[0]), int(pair[-1])
         start_frame = shot2keyf[start_shot].split(' ')[0]
         end_frame = shot2keyf[end_shot].split(' ')[1]
-        scene_list.append((start_frame, end_frame))
+        start_timestamp = shot2keyf[start_shot].split(' ')[-2]
+        end_timestamp = shot2keyf[end_shot].split(' ')[-1]
+        scene_list.append([[start_frame, end_frame],
+                           [start_timestamp, end_timestamp]])
     return scene_list, pair_list
 
 
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index b19f7e43..d80ba9c5 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -303,7 +303,8 @@ TASK_OUTPUTS = {
     #        [
     #           {
     #               "shot": [0,1,2],
-    #               "frame": [start_frame, end_frame]
+    #               "frame": [start_frame, end_frame],
+    #               "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
     #           }
     #        ]
     #

From 939a9f232242684dc86f463ac294c14beaa99f3e Mon Sep 17 00:00:00 2001
From: "wendi.hwd" <wendi.hwd@alibaba-inc.com>
Date: Tue, 27 Sep 2022 22:17:41 +0800
Subject: [PATCH 150/175] [to #42322933]fix commits         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10272768

---
 modelscope/outputs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index d80ba9c5..92e3410b 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -153,7 +153,7 @@ TASK_OUTPUTS = {
 
     # semantic segmentation result for single sample
     #   {
-    #       "masks": [np.array # 2D array containing only 0, 255]
+    #       "masks": [np.array # 2D array with shape [height, width]]
     #   }
     Tasks.semantic_segmentation: [OutputKeys.MASKS],
 

From 744c84c89302728d0d6bfaca411d00abdee5b310 Mon Sep 17 00:00:00 2001
From: "lanjinpeng.ljp" <lanjinpeng.ljp@alibaba-inc.com>
Date: Tue, 27 Sep 2022 22:19:14 +0800
Subject: [PATCH 151/175] output timestamps for video-single-object-tracking
 demo service
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

830版本 video-single-object-tracking demo需要输出timestamps信息
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10278969
---
 .../cv/video_single_object_tracking/utils/utils.py    |  7 +++++++
 modelscope/outputs.py                                 |  6 ++++--
 .../cv/video_single_object_tracking_pipeline.py       | 11 +++++++++--
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/modelscope/models/cv/video_single_object_tracking/utils/utils.py b/modelscope/models/cv/video_single_object_tracking/utils/utils.py
index 752ec272..90513a2a 100644
--- a/modelscope/models/cv/video_single_object_tracking/utils/utils.py
+++ b/modelscope/models/cv/video_single_object_tracking/utils/utils.py
@@ -238,3 +238,10 @@ def check_box(box: list, image_height, image_width) -> bool:
     if box[3] < 0 or box[3] >= image_height:
         return False
     return True
+
+
+def timestamp_format(seconds):
+    """Format a duration given in seconds as an 'HH:MM:SS.mmm' string."""
+    minutes, secs = divmod(seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    return '%02d:%02d:%06.3f' % (hours, minutes, secs)
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 92e3410b..b96f38d3 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -247,9 +247,11 @@ TASK_OUTPUTS = {
     #               [x1, y1, x2, y2],
     #               [x1, y1, x2, y2],
     #               [x1, y1, x2, y2],
-    #             ]
+    #             ],
+    #   "timestamps": ["hh:mm:ss.fff", "hh:mm:ss.fff", "hh:mm:ss.fff"]
     # }
-    Tasks.video_single_object_tracking: [OutputKeys.BOXES],
+    Tasks.video_single_object_tracking:
+    [OutputKeys.BOXES, OutputKeys.TIMESTAMPS],
 
     # live category recognition result for single video
     # {
diff --git a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py
index c47fc15f..4169def7 100644
--- a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py
+++ b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py
@@ -9,8 +9,8 @@ from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
     cfg
 from modelscope.models.cv.video_single_object_tracking.tracker.ostrack import \
     OSTrack
-from modelscope.models.cv.video_single_object_tracking.utils.utils import \
-    check_box
+from modelscope.models.cv.video_single_object_tracking.utils.utils import (
+    check_box, timestamp_format)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -45,7 +45,10 @@ class VideoSingleObjectTrackingPipeline(Pipeline):
 
     def forward(self, input: Input) -> Dict[str, Any]:
         output_boxes = []
+        output_timestamps = []
         cap = cv2.VideoCapture(self.video_path)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_idx = 0
         success, frame = cap.read()
         if success is False:
             raise Exception(
@@ -58,6 +61,7 @@ class VideoSingleObjectTrackingPipeline(Pipeline):
             raise Exception('modelscope error: init_box out of image range ',
                             init_box)
         output_boxes.append(init_box.copy())
+        output_timestamps.append(timestamp_format(seconds=frame_idx / fps))
         init_box[2] = init_box[2] - init_box[0]
         init_box[3] = init_box[3] - init_box[1]
         self.tracker.initialize(frame, {'init_bbox': init_box})
@@ -67,14 +71,17 @@ class VideoSingleObjectTrackingPipeline(Pipeline):
             ret, frame = cap.read()
             if frame is None:
                 break
+            frame_idx += 1
             out = self.tracker.track(frame)
             state = [int(s) for s in out['target_bbox']]
             output_boxes.append(state)
+            output_timestamps.append(timestamp_format(seconds=frame_idx / fps))
         cap.release()
         logger.info('tracking process done')
 
         return {
             OutputKeys.BOXES: output_boxes,
+            OutputKeys.TIMESTAMPS: output_timestamps
         }
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:

From 357a233ee32bbaec7eaef58f383d86219b3f9cd3 Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Tue, 27 Sep 2022 23:03:00 +0800
Subject: [PATCH 152/175] [to #42322933] fix bug: checkpoint hook and
 bestckpthook exists at the same time         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10227608

---
 modelscope/trainers/default_config.py        | 19 +++++++++++++++++++
 modelscope/trainers/trainer.py               |  7 ++-----
 tests/trainers/hooks/test_checkpoint_hook.py |  3 ---
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py
index 69fdd400..c8f0c7b0 100644
--- a/modelscope/trainers/default_config.py
+++ b/modelscope/trainers/default_config.py
@@ -1,4 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+
+from modelscope.utils.config import Config
+
 DEFAULT_CONFIG = {
     'train': {
         'hooks': [{
@@ -12,3 +15,19 @@ DEFAULT_CONFIG = {
         }]
     }
 }
+
+
+def merge_cfg(cfg: Config):
+    """Merge DEFAULT_CONFIG into `cfg` (with ``force=False``).
+
+    After merging, if a hook of type 'BestCkptSaverHook' is configured,
+    every 'CheckpointHook' entry is removed from ``cfg.train.hooks`` so the
+    two checkpoint savers do not exist at the same time.
+
+    @param cfg: The input cfg to be merged into.
+    """
+    cfg.merge_from_dict(DEFAULT_CONFIG, force=False)
+    hooks = cfg.train.hooks
+    # pop duplicate hook
+    if any(hook['type'] == 'BestCkptSaverHook' for hook in hooks):
+        cfg.train.hooks = [h for h in hooks if h['type'] != 'CheckpointHook']
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index d3675720..a01d9b59 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -41,7 +41,7 @@ from modelscope.utils.torch_utils import (get_dist_info, get_local_rank,
                                           init_dist, set_random_seed)
 from .base import BaseTrainer
 from .builder import TRAINERS
-from .default_config import DEFAULT_CONFIG
+from .default_config import merge_cfg
 from .hooks.hook import Hook
 from .parallel.builder import build_parallel
 from .parallel.utils import is_parallel
@@ -114,7 +114,7 @@ class EpochBasedTrainer(BaseTrainer):
         super().__init__(cfg_file, arg_parse_fn)
 
         # add default config
-        self.cfg.merge_from_dict(self._get_default_config(), force=False)
+        merge_cfg(self.cfg)
         self.cfg = self.rebuild_config(self.cfg)
 
         if 'cfg_options' in kwargs:
@@ -951,9 +951,6 @@ class EpochBasedTrainer(BaseTrainer):
                 stage_hook_infos.append(info)
         return '\n'.join(stage_hook_infos)
 
-    def _get_default_config(self):
-        return DEFAULT_CONFIG
-
 
 def worker_init_fn(worker_id, num_workers, rank, seed):
     # The seed of each worker equals to
diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py
index c694ece6..e7f2d33c 100644
--- a/tests/trainers/hooks/test_checkpoint_hook.py
+++ b/tests/trainers/hooks/test_checkpoint_hook.py
@@ -204,9 +204,6 @@ class BestCkptSaverHookTest(unittest.TestCase):
         trainer = build_trainer(trainer_name, kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
-        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
-        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
-        self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
         self.assertIn(f'best_{LogKeys.EPOCH}1_{MetricKeys.ACCURACY}0.1.pth',
                       results_files)
 

From 372adb3936939c0079924cd8a761e525b4fbd77f Mon Sep 17 00:00:00 2001
From: "tingwei.gtw" <tingwei.gtw@alibaba-inc.com>
Date: Tue, 27 Sep 2022 23:04:38 +0800
Subject: [PATCH 153/175] [to #42322933] support hand-static model        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10244616

---
 data/test/images/hand_static.jpg              |   3 +
 modelscope/metainfo.py                        |   2 +
 modelscope/models/cv/hand_static/__init__.py  |  20 +
 .../models/cv/hand_static/hand_model.py       |  93 +++++
 modelscope/models/cv/hand_static/networks.py  | 358 ++++++++++++++++++
 modelscope/outputs.py                         |   6 +-
 modelscope/pipelines/builder.py               |   2 +
 modelscope/pipelines/cv/__init__.py           |   4 +-
 .../pipelines/cv/hand_static_pipeline.py      |  37 ++
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_hand_static.py           |  32 ++
 11 files changed, 556 insertions(+), 2 deletions(-)
 create mode 100644 data/test/images/hand_static.jpg
 create mode 100644 modelscope/models/cv/hand_static/__init__.py
 create mode 100644 modelscope/models/cv/hand_static/hand_model.py
 create mode 100644 modelscope/models/cv/hand_static/networks.py
 create mode 100644 modelscope/pipelines/cv/hand_static_pipeline.py
 create mode 100644 tests/pipelines/test_hand_static.py

diff --git a/data/test/images/hand_static.jpg b/data/test/images/hand_static.jpg
new file mode 100644
index 00000000..43ae28b1
--- /dev/null
+++ b/data/test/images/hand_static.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94b8e281d77ee6d3ea2a8a0c9408ecdbd29fe75f33ea5399b6ea00070ba77bd6
+size 13090
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 29a35fbe..5870ebe3 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -39,6 +39,7 @@ class Models(object):
     mtcnn = 'mtcnn'
     ulfd = 'ulfd'
     video_inpainting = 'video-inpainting'
+    hand_static = 'hand-static'
 
     # EasyCV models
     yolox = 'YOLOX'
@@ -173,6 +174,7 @@ class Pipelines(object):
     movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
     shop_segmentation = 'shop-segmentation'
     video_inpainting = 'video-inpainting'
+    hand_static = 'hand-static'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
diff --git a/modelscope/models/cv/hand_static/__init__.py b/modelscope/models/cv/hand_static/__init__.py
new file mode 100644
index 00000000..654d2acb
--- /dev/null
+++ b/modelscope/models/cv/hand_static/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .hand_model import HandStatic
+
+else:
+    _import_structure = {'hand_model': ['HandStatic']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/hand_static/hand_model.py b/modelscope/models/cv/hand_static/hand_model.py
new file mode 100644
index 00000000..38517307
--- /dev/null
+++ b/modelscope/models/cv/hand_static/hand_model.py
@@ -0,0 +1,93 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import os
+import sys
+
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from torch import nn
+from torchvision.transforms import transforms
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .networks import StaticGestureNet
+
+logger = get_logger()
+
+map_idx = {
+    0: 'unrecog',
+    1: 'one',
+    2: 'two',
+    3: 'bixin',
+    4: 'yaogun',
+    5: 'zan',
+    6: 'fist',
+    7: 'ok',
+    8: 'tuoju',
+    9: 'd_bixin',
+    10: 'd_fist_left',
+    11: 'd_fist_right',
+    12: 'd_hand',
+    13: 'fashe',
+    14: 'five',
+    15: 'nohand'
+}
+
+img_size = [112, 112]
+
+spatial_transform = transforms.Compose([
+    transforms.Resize(img_size),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+])
+
+
+@MODELS.register_module(Tasks.hand_static, module_name=Models.hand_static)
+class HandStatic(TorchModel):
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+
+        self.model = StaticGestureNet()
+        if torch.cuda.is_available():
+            self.device = 'cuda'
+        else:
+            self.device = 'cpu'
+        self.params = torch.load(
+            '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location=self.device)
+
+        self.model.load_state_dict(self.params)
+        self.model.to(self.device)
+        self.model.eval()
+        self.device_id = device_id
+        if self.device_id >= 0 and self.device == 'cuda':
+            self.model.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        else:
+            self.device_id = -1
+            logger.info('Use CPU for inference')
+
+    def forward(self, x):
+        pred_result = self.model(x)
+        return pred_result
+
+
+def infer(img_path, model, device):
+    """Return the gesture label predicted for the image at `img_path`."""
+    img = Image.open(img_path).convert('RGB')  # Normalize expects 3 channels
+    clip = spatial_transform(img).unsqueeze(0).to(device).float()
+    with torch.no_grad():  # inference only; skip autograd bookkeeping
+        outputs = model(clip)
+    predicted = int(outputs.max(1)[1])
+    pred_result = map_idx.get(predicted)
+    logger.info('pred result: {}'.format(pred_result))
+
+    return pred_result
diff --git a/modelscope/models/cv/hand_static/networks.py b/modelscope/models/cv/hand_static/networks.py
new file mode 100644
index 00000000..6cf46f5d
--- /dev/null
+++ b/modelscope/models/cv/hand_static/networks.py
@@ -0,0 +1,358 @@
+""" HandStatic
+The implementation here is modified based on MobileFaceNet,
+originally Apache 2.0 License and publicly available at https://github.com/xuexingyu24/MobileFaceNet_Tutorial_Pytorch
+"""
+
+import os
+
+import torch
+import torch.nn as nn
+import torchvision
+import torchvision.models as models
+from torch.nn import (AdaptiveAvgPool2d, BatchNorm1d, BatchNorm2d, Conv2d,
+                      Dropout, Linear, MaxPool2d, Module, PReLU, ReLU,
+                      Sequential, Sigmoid)
+
+
+class StaticGestureNet(torch.nn.Module):
+
+    def __init__(self, train=True):
+        super().__init__()
+
+        model = MobileFaceNet(512)
+        self.feature_extractor = model
+        self.fc_layer = torch.nn.Sequential(
+            nn.Linear(512, 128), nn.Softplus(), nn.Linear(128, 15))
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, inputs):
+        out = self.feature_extractor(inputs)
+        out = self.fc_layer(out)
+        out = self.sigmoid(out)
+        return out
+
+
+class Flatten(Module):
+
+    def forward(self, input):
+        return input.view(input.size(0), -1)
+
+
+def l2_norm(input, axis=1):
+    norm = torch.norm(input, 2, axis, True)
+    output = torch.div(input, norm)
+    return output
+
+
+class SEModule(Module):
+
+    def __init__(self, channels, reduction):
+        super(SEModule, self).__init__()
+        self.avg_pool = AdaptiveAvgPool2d(1)
+        self.fc1 = Conv2d(
+            channels,
+            channels // reduction,
+            kernel_size=1,
+            padding=0,
+            bias=False)
+        self.relu = ReLU(inplace=True)
+        self.fc2 = Conv2d(
+            channels // reduction,
+            channels,
+            kernel_size=1,
+            padding=0,
+            bias=False)
+        self.sigmoid = Sigmoid()
+
+    def forward(self, x):
+        module_input = x
+        x = self.avg_pool(x)
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.fc2(x)
+        x = self.sigmoid(x)
+        return module_input * x
+
+
+class BottleneckIR(Module):
+
+    def __init__(self, in_channel, depth, stride):
+        super(BottleneckIR, self).__init__()
+        if in_channel == depth:
+            self.shortcut_layer = MaxPool2d(1, stride)
+        else:
+            self.shortcut_layer = Sequential(
+                Conv2d(in_channel, depth, (1, 1), stride, bias=False),
+                BatchNorm2d(depth))
+        self.res_layer = Sequential(
+            BatchNorm2d(in_channel),
+            Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False),
+            PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
+            BatchNorm2d(depth))
+
+    def forward(self, x):
+        shortcut = self.shortcut_layer(x)
+        res = self.res_layer(x)
+        return res + shortcut
+
+
+class BottleneckIRSE(Module):
+
+    def __init__(self, in_channel, depth, stride):
+        super(BottleneckIRSE, self).__init__()
+        if in_channel == depth:
+            self.shortcut_layer = MaxPool2d(1, stride)
+        else:
+            self.shortcut_layer = Sequential(
+                Conv2d(in_channel, depth, (1, 1), stride, bias=False),
+                BatchNorm2d(depth))
+        self.res_layer = Sequential(
+            BatchNorm2d(in_channel),
+            Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False),
+            PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
+            BatchNorm2d(depth), SEModule(depth, 16))
+
+    def forward(self, x):
+        shortcut = self.shortcut_layer(x)
+        res = self.res_layer(x)
+        return res + shortcut
+
+
+def get_block(in_channel, depth, num_units, stride=2):
+    return [Bottleneck(in_channel, depth, stride)
+            ] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)]
+
+
+def get_blocks(num_layers):
+    if num_layers == 50:
+        blocks = [
+            get_block(in_channel=64, depth=64, num_units=3),
+            get_block(in_channel=64, depth=128, num_units=4),
+            get_block(in_channel=128, depth=256, num_units=14),
+            get_block(in_channel=256, depth=512, num_units=3)
+        ]
+    elif num_layers == 100:
+        blocks = [
+            get_block(in_channel=64, depth=64, num_units=3),
+            get_block(in_channel=64, depth=128, num_units=13),
+            get_block(in_channel=128, depth=256, num_units=30),
+            get_block(in_channel=256, depth=512, num_units=3)
+        ]
+    elif num_layers == 152:
+        blocks = [
+            get_block(in_channel=64, depth=64, num_units=3),
+            get_block(in_channel=64, depth=128, num_units=8),
+            get_block(in_channel=128, depth=256, num_units=36),
+            get_block(in_channel=256, depth=512, num_units=3)
+        ]
+    return blocks
+
+
+class Backbone(Module):
+
+    def __init__(self, num_layers, drop_ratio, mode='ir'):
+        super(Backbone, self).__init__()
+        assert num_layers in [50, 100,
+                              152], 'num_layers should be 50,100, or 152'
+        assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se'
+        blocks = get_blocks(num_layers)
+        if mode == 'ir':
+            unit_module = BottleneckIR
+        elif mode == 'ir_se':
+            unit_module = BottleneckIRSE
+        self.input_layer = Sequential(
+            Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64),
+            PReLU(64))
+        self.output_layer = Sequential(
+            BatchNorm2d(512), Dropout(drop_ratio), Flatten(),
+            Linear(512 * 7 * 7, 512), BatchNorm1d(512))
+        modules = []
+        for block in blocks:
+            for bottleneck in block:
+                modules.append(
+                    unit_module(bottleneck.in_channel, bottleneck.depth,
+                                bottleneck.stride))
+        self.body = Sequential(*modules)
+
+    def forward(self, x):
+        x = self.input_layer(x)
+        x = self.body(x)
+        x = self.output_layer(x)
+        return l2_norm(x)
+
+
+class ConvBlock(Module):
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 kernel=(1, 1),
+                 stride=(1, 1),
+                 padding=(0, 0),
+                 groups=1):
+        super(ConvBlock, self).__init__()
+        self.conv = Conv2d(
+            in_c,
+            out_channels=out_c,
+            kernel_size=kernel,
+            groups=groups,
+            stride=stride,
+            padding=padding,
+            bias=False)
+        self.bn = BatchNorm2d(out_c)
+        self.prelu = PReLU(out_c)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.prelu(x)
+        return x
+
+
+class LinearBlock(Module):
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 kernel=(1, 1),
+                 stride=(1, 1),
+                 padding=(0, 0),
+                 groups=1):
+        super(LinearBlock, self).__init__()
+        self.conv = Conv2d(
+            in_c,
+            out_channels=out_c,
+            kernel_size=kernel,
+            groups=groups,
+            stride=stride,
+            padding=padding,
+            bias=False)
+        self.bn = BatchNorm2d(out_c)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+
+
+class DepthWise(Module):
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 residual=False,
+                 kernel=(3, 3),
+                 stride=(2, 2),
+                 padding=(1, 1),
+                 groups=1):
+        super(DepthWise, self).__init__()
+        self.conv = ConvBlock(
+            in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
+        self.conv_dw = ConvBlock(
+            groups,
+            groups,
+            groups=groups,
+            kernel=kernel,
+            padding=padding,
+            stride=stride)
+        self.project = LinearBlock(
+            groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
+        self.residual = residual
+
+    def forward(self, x):
+        if self.residual:
+            short_cut = x
+        x = self.conv(x)
+        x = self.conv_dw(x)
+        x = self.project(x)
+        if self.residual:
+            output = short_cut + x
+        else:
+            output = x
+        return output
+
+
+class Residual(Module):
+
+    def __init__(self,
+                 c,
+                 num_block,
+                 groups,
+                 kernel=(3, 3),
+                 stride=(1, 1),
+                 padding=(1, 1)):
+        super(Residual, self).__init__()
+        modules = []
+        for _ in range(num_block):
+            modules.append(
+                DepthWise(
+                    c,
+                    c,
+                    residual=True,
+                    kernel=kernel,
+                    padding=padding,
+                    stride=stride,
+                    groups=groups))
+        self.model = Sequential(*modules)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class MobileFaceNet(Module):
+
+    def __init__(self, embedding_size):
+        super(MobileFaceNet, self).__init__()
+        self.conv1 = ConvBlock(
+            3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1))
+        self.conv2_dw = ConvBlock(
+            64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64)
+        self.conv_23 = DepthWise(
+            64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128)
+        self.conv_3 = Residual(
+            64,
+            num_block=4,
+            groups=128,
+            kernel=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1))
+        self.conv_34 = DepthWise(
+            64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
+        self.conv_4 = Residual(
+            128,
+            num_block=6,
+            groups=256,
+            kernel=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1))
+        self.conv_45 = DepthWise(
+            128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512)
+        self.conv_5 = Residual(
+            128,
+            num_block=2,
+            groups=256,
+            kernel=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1))
+        self.conv_6_sep = ConvBlock(
+            128, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
+        self.conv_6_dw = LinearBlock(
+            512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0))
+        self.conv_6_flatten = Flatten()
+        self.linear = Linear(512, embedding_size, bias=False)
+        self.bn = BatchNorm1d(embedding_size)
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = self.conv2_dw(out)
+        out = self.conv_23(out)
+        out = self.conv_3(out)
+        out = self.conv_34(out)
+        out = self.conv_4(out)
+        out = self.conv_45(out)
+        out = self.conv_5(out)
+        out = self.conv_6_sep(out)
+        out = self.conv_6_dw(out)
+        out = self.conv_6_flatten(out)
+        out = self.linear(out)
+        return l2_norm(out)
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index b96f38d3..ce9e8d07 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -632,5 +632,9 @@ TASK_OUTPUTS = {
     # {
     #     'output': ['Done' / 'Decode_Error']
     # }
-    Tasks.video_inpainting: [OutputKeys.OUTPUT]
+    Tasks.video_inpainting: [OutputKeys.OUTPUT],
+    # {
+    #     'output': ['bixin']
+    # }
+    Tasks.hand_static: [OutputKeys.OUTPUT]
 }
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 5e244b27..51d50d51 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -178,6 +178,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                               'damo/cv_vitb16_segmentation_shop-seg'),
     Tasks.video_inpainting: (Pipelines.video_inpainting,
                              'damo/cv_video-inpainting'),
+    Tasks.hand_static: (Pipelines.hand_static,
+                        'damo/cv_mobileface_hand-static'),
 }
 
 
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index a9dc05f2..55bad09a 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -52,7 +52,8 @@ if TYPE_CHECKING:
     from .ulfd_face_detection_pipeline import UlfdFaceDetectionPipeline
     from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline
     from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline
-    from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipeline
+    from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipeline
+    from .hand_static_pipeline import HandStaticPipeline
 
 else:
     _import_structure = {
@@ -119,6 +120,7 @@ else:
         'facial_expression_recognition_pipelin':
         ['FacialExpressionRecognitionPipeline'],
         'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'],
+        'hand_static_pipeline': ['HandStaticPipeline'],
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/hand_static_pipeline.py b/modelscope/pipelines/cv/hand_static_pipeline.py
new file mode 100644
index 00000000..1219c873
--- /dev/null
+++ b/modelscope/pipelines/cv/hand_static_pipeline.py
@@ -0,0 +1,37 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.hand_static import hand_model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.hand_static, module_name=Pipelines.hand_static)
+class HandStaticPipeline(Pipeline):
+    """Pipeline that feeds input['img_path'] to `hand_model.infer`."""
+    def __init__(self, model: str, **kwargs):
+        """Build the pipeline around a hand-static model.
+
+        Args:
+            model: model id on modelscope hub.
+            **kwargs: forwarded unchanged to `Pipeline.__init__`.
+        """
+        super().__init__(model=model, **kwargs)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input  # pass-through; forward() reads input['img_path']
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result = hand_model.infer(input['img_path'], self.model, self.device)
+        return {OutputKeys.OUTPUT: result}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs  # forward() already built the output dict
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index de3d933f..75add1d9 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -42,6 +42,7 @@ class CVTasks(object):
     portrait_matting = 'portrait-matting'
     text_driven_segmentation = 'text-driven-segmentation'
     shop_segmentation = 'shop-segmentation'
+    hand_static = 'hand-static'
 
     # image editing
     skin_retouching = 'skin-retouching'
diff --git a/tests/pipelines/test_hand_static.py b/tests/pipelines/test_hand_static.py
new file mode 100644
index 00000000..37181899
--- /dev/null
+++ b/tests/pipelines/test_hand_static.py
@@ -0,0 +1,32 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class HandStaticTest(unittest.TestCase):
+    """Smoke tests for the hand-static pipeline (results are only printed)."""
+    def setUp(self) -> None:
+        self.model = 'damo/cv_mobileface_hand-static'
+        self.input = {'img_path': 'data/test/images/hand_static.jpg'}
+
+    def pipeline_inference(self, pipeline: Pipeline, input: dict):
+        result = pipeline(input)  # input is the {'img_path': ...} dict
+        print(result)  # output is printed, not asserted
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        hand_static = pipeline(Tasks.hand_static, model=self.model)
+        self.pipeline_inference(hand_static, self.input)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        hand_static = pipeline(Tasks.hand_static)  # no explicit model id
+        self.pipeline_inference(hand_static, self.input)
+
+
+if __name__ == '__main__':
+    unittest.main()

From d721fabb343c9bfe8721464dee5d4dd30d634e26 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Tue, 27 Sep 2022 23:08:33 +0800
Subject: [PATCH 154/175] [to #42322933]bert with sequence classification /
 token classification/ fill mask refactor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1.新增支持原始bert模型（非easynlp的 backbone prefix版本）
2.支持bert的在sequence classification/fill mask /token classification上的backbone head形式
3.统一了sequence classification几个任务的pipeline到一个类
4.fill mask 支持backbone head形式
5.token classification的几个子任务（ner，word seg， part of speech）的preprocessor 统一到了一起TokenClassificationPreprocessor
6. sequence classification的几个子任务（single classification， pair classification）的preprocessor 统一到了一起SequenceClassificationPreprocessor
7. 改动register中 cls的group_key 赋值位置，之前的group_key在多个decorators的情况下，会被覆盖，obj_cls的group_key信息不正确
8. 基于backbone head形式将 原本group_key和 module同名的情况尝试做调整，如下在modelscope/pipelines/nlp/sequence_classification_pipeline.py 中
原本
 @PIPELINES.register_module(
    Tasks.sentiment_classification, module_name=Pipelines.sentiment_classification)
改成
@PIPELINES.register_module(
    Tasks.text_classification, module_name=Pipelines.sentiment_classification)
相应的configuration.json也有改动，这样的改动更符合任务和pipeline（子任务）的关系。
9. 其他相应改动为支持上述功能
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10041463
---
 modelscope/metainfo.py                        |   11 +-
 modelscope/models/builder.py                  |    9 +-
 modelscope/models/nlp/__init__.py             |   22 +-
 modelscope/models/nlp/backbones/bert.py       |    7 +
 modelscope/models/nlp/bert/__init__.py        |   60 +
 .../models/nlp/bert/configuration_bert.py     |  162 ++
 modelscope/models/nlp/bert/modeling_bert.py   | 2040 +++++++++++++++++
 .../nlp/bert_for_sequence_classification.py   |   70 -
 modelscope/models/nlp/deberta_v2/__init__.py  |   10 -
 modelscope/models/nlp/heads/fill_mask_head.py |  101 +
 .../models/nlp/heads/torch_pretrain_head.py   |    2 +-
 modelscope/models/nlp/masked_language.py      |    5 +-
 .../nlp/nncrf_for_named_entity_recognition.py |    9 +-
 .../models/nlp/sequence_classification.py     |   83 +-
 modelscope/models/nlp/task_models/__init__.py |    4 +
 .../nlp/task_models/feature_extraction.py     |   43 +
 .../models/nlp/task_models/fill_mask.py       |   47 +
 .../nlp/task_models/information_extraction.py |   15 +-
 .../task_models/sequence_classification.py    |   49 +-
 .../models/nlp/task_models/task_model.py      |   29 +-
 .../nlp/task_models/token_classification.py   |   15 +-
 modelscope/models/nlp/token_classification.py |   49 +-
 modelscope/outputs.py                         |   16 +
 modelscope/pipelines/builder.py               |    7 +-
 modelscope/pipelines/nlp/__init__.py          |   19 +-
 .../nlp/feature_extraction_pipeline.py        |   82 +
 .../pipelines/nlp/fill_mask_pipeline.py       |    9 +-
 .../nlp/information_extraction_pipeline.py    |    2 +-
 .../nlp/named_entity_recognition_pipeline.py  |    5 +-
 .../pair_sentence_classification_pipeline.py  |   59 -
 .../nlp/sequence_classification_pipeline.py   |   72 +-
 .../sequence_classification_pipeline_base.py  |   62 -
 ...single_sentence_classification_pipeline.py |   56 -
 .../nlp/token_classification_pipeline.py      |    2 +-
 modelscope/preprocessors/__init__.py          |   48 +-
 modelscope/preprocessors/nlp/__init__.py      |   45 +-
 modelscope/preprocessors/nlp/nlp_base.py      |  575 ++---
 modelscope/utils/constant.py                  |    1 +
 modelscope/utils/registry.py                  |    2 +-
 tests/msdatasets/test_ms_dataset.py           |    3 +-
 tests/pipelines/test_deberta_tasks.py         |    8 +-
 tests/pipelines/test_feature_extraction.py    |   67 +
 tests/pipelines/test_fill_mask.py             |   49 +-
 .../test_named_entity_recognition.py          |   10 +-
 tests/pipelines/test_nli.py                   |   10 +-
 tests/pipelines/test_sentence_similarity.py   |   10 +-
 .../test_sentiment_classification.py          |   31 +-
 tests/pipelines/test_text_classification.py   |    4 +-
 tests/preprocessors/test_nlp.py               |   76 +
 tests/utils/test_ast.py                       |   12 +-
 50 files changed, 3347 insertions(+), 837 deletions(-)
 create mode 100644 modelscope/models/nlp/backbones/bert.py
 create mode 100644 modelscope/models/nlp/bert/__init__.py
 create mode 100644 modelscope/models/nlp/bert/configuration_bert.py
 create mode 100755 modelscope/models/nlp/bert/modeling_bert.py
 delete mode 100644 modelscope/models/nlp/bert_for_sequence_classification.py
 create mode 100644 modelscope/models/nlp/heads/fill_mask_head.py
 create mode 100644 modelscope/models/nlp/task_models/feature_extraction.py
 create mode 100644 modelscope/models/nlp/task_models/fill_mask.py
 create mode 100644 modelscope/pipelines/nlp/feature_extraction_pipeline.py
 delete mode 100644 modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
 delete mode 100644 modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
 delete mode 100644 modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
 create mode 100644 tests/pipelines/test_feature_extraction.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 5870ebe3..a1cf5e06 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -91,17 +91,22 @@ class TaskModels(object):
     text_classification = 'text-classification'
     token_classification = 'token-classification'
     information_extraction = 'information-extraction'
+    fill_mask = 'fill-mask'
+    feature_extraction = 'feature-extraction'
 
 
 class Heads(object):
     # nlp heads
+
+    # text cls
     text_classification = 'text-classification'
-    # mlm
+    # fill mask
+    fill_mask = 'fill-mask'
     bert_mlm = 'bert-mlm'
-    # roberta mlm
     roberta_mlm = 'roberta-mlm'
     # token cls
     token_classification = 'token-classification'
+    # extraction
     information_extraction = 'information-extraction'
 
 
@@ -203,6 +208,7 @@ class Pipelines(object):
     passage_ranking = 'passage-ranking'
     relation_extraction = 'relation-extraction'
     document_segmentation = 'document-segmentation'
+    feature_extraction = 'feature-extraction'
 
     # audio tasks
     sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -306,6 +312,7 @@ class Preprocessors(object):
     table_question_answering_preprocessor = 'table-question-answering-preprocessor'
     re_tokenizer = 're-tokenizer'
     document_segmentation = 'document-segmentation'
+    feature_extraction = 'feature-extraction'
 
     # audio preprocessor
     linear_aec_fbank = 'linear-aec-fbank'
diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py
index 33f111a8..7a8e28f4 100644
--- a/modelscope/models/builder.py
+++ b/modelscope/models/builder.py
@@ -37,13 +37,18 @@ def build_backbone(cfg: ConfigDict,
         cfg, BACKBONES, group_key=field, default_args=default_args)
 
 
-def build_head(cfg: ConfigDict, default_args: dict = None):
+def build_head(cfg: ConfigDict,
+               group_key: str = None,
+               default_args: dict = None):
     """ build head given config dict
 
     Args:
         cfg (:obj:`ConfigDict`): config dict for head object.
+        group_key (str, optional): Registry group to look the head up in.
+            Defaults to ``cfg[TYPE_NAME]`` when omitted.
         default_args (dict, optional): Default initialization arguments.
     """
-
+    if group_key is None:
+        group_key = cfg[TYPE_NAME]
     return build_from_cfg(
-        cfg, HEADS, group_key=cfg[TYPE_NAME], default_args=default_args)
+        cfg, HEADS, group_key=group_key, default_args=default_args)
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 152a32dc..8ef96365 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -6,7 +6,6 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .backbones import SbertModel
     from .bart_for_text_error_correction import BartForTextErrorCorrection
-    from .bert_for_sequence_classification import BertForSequenceClassification
     from .bert_for_document_segmentation import BertForDocumentSegmentation
     from .csanmt_for_translation import CsanmtForTranslation
     from .heads import SequenceClassificationHead
@@ -20,12 +19,15 @@ if TYPE_CHECKING:
     from .palm_v2 import PalmForTextGeneration
     from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering
     from .star_text_to_sql import StarForTextToSql
-    from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification
+    from .sequence_classification import (VecoForSequenceClassification,
+                                          SbertForSequenceClassification,
+                                          BertForSequenceClassification)
     from .space import SpaceForDialogIntent
     from .space import SpaceForDialogModeling
     from .space import SpaceForDialogStateTracking
     from .table_question_answering import TableQuestionAnswering
-    from .task_models import (InformationExtractionModel,
+    from .task_models import (FeatureExtractionModel,
+                              InformationExtractionModel,
                               SequenceClassificationModel,
                               SingleBackboneTaskModelBase,
                               TokenClassificationModel)
@@ -37,7 +39,6 @@ else:
     _import_structure = {
         'backbones': ['SbertModel'],
         'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
-        'bert_for_sequence_classification': ['BertForSequenceClassification'],
         'bert_for_document_segmentation': ['BertForDocumentSegmentation'],
         'csanmt_for_translation': ['CsanmtForTranslation'],
         'heads': ['SequenceClassificationHead'],
@@ -54,15 +55,20 @@ else:
         'palm_v2': ['PalmForTextGeneration'],
         'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'],
         'star_text_to_sql': ['StarForTextToSql'],
-        'sequence_classification':
-        ['VecoForSequenceClassification', 'SbertForSequenceClassification'],
+        'sequence_classification': [
+            'VecoForSequenceClassification', 'SbertForSequenceClassification',
+            'BertForSequenceClassification'
+        ],
         'space': [
             'SpaceForDialogIntent', 'SpaceForDialogModeling',
             'SpaceForDialogStateTracking'
         ],
         'task_models': [
-            'InformationExtractionModel', 'SequenceClassificationModel',
-            'SingleBackboneTaskModelBase', 'TokenClassificationModel'
+            'FeatureExtractionModel',
+            'InformationExtractionModel',
+            'SequenceClassificationModel',
+            'SingleBackboneTaskModelBase',
+            'TokenClassificationModel',
         ],
         'token_classification': ['SbertForTokenClassification'],
         'table_question_answering': ['TableQuestionAnswering'],
diff --git a/modelscope/models/nlp/backbones/bert.py b/modelscope/models/nlp/backbones/bert.py
new file mode 100644
index 00000000..aa513944
--- /dev/null
+++ b/modelscope/models/nlp/backbones/bert.py
@@ -0,0 +1,7 @@
+from modelscope.metainfo import Models
+from modelscope.models.builder import BACKBONES
+from modelscope.models.nlp.bert import BertModel
+from modelscope.utils.constant import Fields
+
+BACKBONES.register_module(  # expose BertModel as an NLP-field backbone
+    group_key=Fields.nlp, module_name=Models.bert, module_cls=BertModel)
diff --git a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py
new file mode 100644
index 00000000..705d9519
--- /dev/null
+++ b/modelscope/models/nlp/bert/__init__.py
@@ -0,0 +1,56 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+# NOTE: list only modules and names that exist in this package
+# (configuration_bert, modeling_bert); it ships no tokenizer modules.
+if TYPE_CHECKING:
+    from .modeling_bert import (
+        BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        BertForMaskedLM,
+        BertForMultipleChoice,
+        BertForNextSentencePrediction,
+        BertForPreTraining,
+        BertForQuestionAnswering,
+        BertForSequenceClassification,
+        BertForTokenClassification,
+        BertLayer,
+        BertLMHeadModel,
+        BertModel,
+        BertPreTrainedModel,
+        load_tf_weights_in_bert,
+    )
+
+    from .configuration_bert import BertConfig, BertOnnxConfig
+
+else:
+    _import_structure = {
+        'configuration_bert': ['BertConfig', 'BertOnnxConfig'],
+    }
+
+    _import_structure['modeling_bert'] = [
+        'BERT_PRETRAINED_MODEL_ARCHIVE_LIST',
+        'BertForMaskedLM',
+        'BertForMultipleChoice',
+        'BertForNextSentencePrediction',
+        'BertForPreTraining',
+        'BertForQuestionAnswering',
+        'BertForSequenceClassification',
+        'BertForTokenClassification',
+        'BertLayer',
+        'BertLMHeadModel',
+        'BertModel',
+        'BertPreTrainedModel',
+        'load_tf_weights_in_bert',
+    ]
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/bert/configuration_bert.py b/modelscope/models/nlp/bert/configuration_bert.py
new file mode 100644
index 00000000..2c9293ec
--- /dev/null
+++ b/modelscope/models/nlp/bert/configuration_bert.py
@@ -0,0 +1,162 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" BERT model configuration """
+from collections import OrderedDict
+from typing import Mapping
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class BertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`BertModel`] or a [`TFBertModel`]. It is used to instantiate a BERT model
+    according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the BERT
+    [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to
+    control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the number of different
+            tokens that can be represented by the `inputs_ids` passed when
+            calling [`BertModel`] or [`TFBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward)
+            layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the
+            encoder and pooler. If string, `"gelu"`, `"relu"`, `"silu"` and
+            `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the
+            embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or
+            1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling
+            [`BertModel`] or [`TFBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`,
+            `"relative_key"`, `"relative_key_query"`. For positional embeddings
+            use `"absolute"`. For more information on `"relative_key"`, please
+            refer to [Self-Attention with Relative Position Representations
+            (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more
+            information on `"relative_key_query"`, please refer to *Method 4* in
+            [Improve Transformer Models with Better Relative Position Embeddings
+            (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import BertModel, BertConfig
+    >>> # Initializing a BERT bert-base-uncased style configuration
+    >>> configuration = BertConfig()
+
+    >>> # Initializing a model from the bert-base-uncased style configuration
+    >>> model = BertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = 'bert'
+
+    def __init__(self,
+                 vocab_size=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 pad_token_id=0,
+                 position_embedding_type='absolute',
+                 use_cache=True,
+                 classifier_dropout=None,
+                 **kwargs):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+        # pad_token_id is handled by PretrainedConfig; the rest is stored here.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class BertOnnxConfig(OnnxConfig):
+    """ONNX export config naming the axes of BERT's three input tensors."""
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict([  # input name -> {axis index: axis name}
+            ('input_ids', {
+                0: 'batch',
+                1: 'sequence'
+            }),
+            ('attention_mask', {
+                0: 'batch',
+                1: 'sequence'
+            }),
+            ('token_type_ids', {
+                0: 'batch',
+                1: 'sequence'
+            }),
+        ])
diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py
new file mode 100755
index 00000000..f8fd5994
--- /dev/null
+++ b/modelscope/models/nlp/bert/modeling_bert.py
@@ -0,0 +1,2040 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model. """
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+import torch.utils.checkpoint
+from packaging import version
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.file_utils import (ModelOutput, add_start_docstrings,
+                                     add_start_docstrings_to_model_forward,
+                                     replace_return_docstrings)
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions, MaskedLMOutput,
+    MultipleChoiceModelOutput, NextSentencePredictorOutput,
+    QuestionAnsweringModelOutput, SequenceClassifierOutput,
+    TokenClassifierOutput)
+from transformers.modeling_utils import (PreTrainedModel,
+                                         apply_chunking_to_forward,
+                                         find_pruneable_heads_and_indices,
+                                         prune_linear_layer)
+
+from modelscope.models.base import TorchModel
+from modelscope.utils.logger import get_logger
+from .configuration_bert import BertConfig
+
+logger = get_logger(__name__)
+
+_CONFIG_FOR_DOC = 'BertConfig'
+
+
+def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
+    """Load the variables of a TensorFlow checkpoint into a PyTorch model.
+
+    Raises ImportError if numpy/TensorFlow are missing and ValueError when a
+    checkpoint array and its target tensor differ in shape. `config` is
+    unused. Returns `model` after its matched tensors were overwritten.
+    """
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see '
+            'https://www.tensorflow.org/install/ for installation instructions.'
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f'Converting TensorFlow checkpoint from {tf_path}')
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f'Loading TF weight {name} with shape {shape}')
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+        # which are not required for using pretrained model
+        if any(n in [
+                'adam_v', 'adam_m', 'AdamWeightDecayOptimizer',
+                'AdamWeightDecayOptimizer_1', 'global_step'
+        ] for n in name):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                scope_names = re.split(r'_(\d+)', m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] in ('kernel', 'gamma', 'output_weights'):
+                pointer = getattr(pointer, 'weight')
+            elif scope_names[0] in ('output_bias', 'beta'):
+                pointer = getattr(pointer, 'bias')
+            elif scope_names[0] == 'squad':
+                pointer = getattr(pointer, 'classifier')
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    # Only this path component is skipped; the walk goes on
+                    # from the same `pointer` with the next component.
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                pointer = pointer[int(scope_names[1])]
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        if pointer.shape != array.shape:
+            raise ValueError(
+                f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched'
+            )
+        logger.info(f'Initialize PyTorch weight {name}')
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+class BertEmbeddings(nn.Module):
+    """Sum word, token-type and (absolute) position embeddings.
+
+    The summed embeddings are layer-normalized and passed through dropout.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        hidden = config.hidden_size
+        max_positions = config.max_position_embeddings
+        self.word_embeddings = nn.Embedding(
+            config.vocab_size, hidden, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(max_positions, hidden)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  hidden)
+
+        # The attribute keeps the TensorFlow variable name `LayerNorm`
+        # (not snake case) so that TensorFlow checkpoints can be loaded.
+        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.position_embedding_type = getattr(config,
+                                               'position_embedding_type',
+                                               'absolute')
+        # position_ids has shape (1, max_position_embeddings); it is a
+        # persistent buffer and therefore part of the state dict.
+        self.register_buffer('position_ids',
+                             torch.arange(max_positions).expand((1, -1)))
+        # The all-zero token_type_ids buffer is only registered on torch
+        # versions newer than 1.6.0 and is never serialized.
+        if version.parse(torch.__version__) > version.parse('1.6.0'):
+            self.register_buffer(
+                'token_type_ids',
+                torch.zeros(self.position_ids.size(), dtype=torch.long),
+                persistent=False,
+            )
+
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                inputs_embeds=None,
+                past_key_values_length=0):
+        """Embed the inputs and apply LayerNorm followed by dropout.
+
+        Either `input_ids` or `inputs_embeds` must be given; absent
+        position/token-type ids fall back to buffers or zeros.
+        """
+        if input_ids is None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            input_shape = input_ids.size()
+        batch_size, seq_length = input_shape[0], input_shape[1]
+
+        if position_ids is None:
+            # Skip the positions already covered by cached key/values.
+            start = past_key_values_length
+            position_ids = self.position_ids[:, start:seq_length + start]
+
+        # Defaulting to the registered all-zero buffer lets users trace
+        # the model without passing token_type_ids.
+        if token_type_ids is None:
+            if hasattr(self, 'token_type_ids'):
+                token_type_ids = self.token_type_ids[:, :seq_length].expand(
+                    batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(
+                    input_shape,
+                    dtype=torch.long,
+                    device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        embeddings = inputs_embeds + self.token_type_embeddings(
+            token_type_ids)
+        if self.position_embedding_type == 'absolute':
+            embeddings += self.position_embeddings(position_ids)
+        embeddings = self.LayerNorm(embeddings)
+        return self.dropout(embeddings)
+
+
+class BertSelfAttention(nn.Module):
+    """Multi-head scaled dot-product attention with optional key/value cache."""
+
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
+                config, 'embedding_size'):
+            raise ValueError(
+                f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention '
+                f'heads ({config.num_attention_heads})')
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, 'position_embedding_type', 'absolute')
+        if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query':
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(
+                2 * config.max_position_embeddings - 1,
+                self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x):
+        """Reshape (batch, seq, all_head) to (batch, heads, seq, head_size)."""
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Attend from `hidden_states` to itself or to the encoder states.
+
+        Passing `encoder_hidden_states` selects cross-attention and makes
+        `encoder_attention_mask` the mask. `past_key_value` is reused as-is
+        for cross-attention and prepended for self-attention.
+
+        Returns a tuple `(context_layer,)`, followed by the attention
+        probabilities if `output_attentions` is set and by the
+        `(key_layer, value_layer)` pair if the module is a decoder.
+        """
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer, value_layer = past_key_value[0], past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(
+                self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(
+                self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        if self.is_decoder:
+            # Hand the (possibly extended) keys/values back so that the next
+            # call can reuse them through the cached branches above.
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query':
+            # Queries are the trailing `query_length` positions of the
+            # `key_length` keys, which matters once a cache is prepended.
+            query_length, key_length = query_layer.size(2), key_layer.size(2)
+            position_ids_l = torch.arange(
+                key_length - query_length, key_length, dtype=torch.long,
+                device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(
+                key_length, dtype=torch.long,
+                device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(
+                distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(
+                dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == 'relative_key':
+                relative_position_scores = torch.einsum(
+                    'bhld,lrd->bhlr', query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == 'relative_key_query':
+                relative_position_scores_query = torch.einsum(
+                    'bhld,lrd->bhlr', query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum(
+                    'bhrd,lrd->bhlr', key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(
+            self.attention_head_size)
+        if attention_mask is not None:
+            # The mask is additive; it is simply summed onto the raw scores.
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (
+            self.all_head_size, )
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer,
+                   attention_probs) if output_attentions else (context_layer, )
+        if self.is_decoder:
+            outputs = outputs + (past_key_value, )
+        return outputs
+
+
+class BertSelfOutput(nn.Module):
+    """Dense projection, dropout, then residual add + LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        # Residual connection is applied after dropout, before the norm.
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class BertAttention(nn.Module):
+    """Self-attention followed by its output projection, with head pruning."""
+
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        self.self = BertSelfAttention(
+            config, position_embedding_type=position_embedding_type)
+        self.output = BertSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """Prune the given heads from the q/k/v and output projections."""
+        if len(heads) == 0:
+            return
+        attn = self.self
+        heads, index = find_pruneable_heads_and_indices(
+            heads, attn.num_attention_heads, attn.attention_head_size,
+            self.pruned_heads)
+
+        # Prune linear layers
+        for proj_name in ('query', 'key', 'value'):
+            setattr(attn, proj_name,
+                    prune_linear_layer(getattr(attn, proj_name), index))
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        attn.num_attention_heads -= len(heads)
+        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Run attention and project; extra attention outputs are kept."""
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        return (attention_output, ) + self_outputs[1:]
+
+
+class BertIntermediate(nn.Module):
+    """Expand hidden states to the intermediate size and activate them."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        # `hidden_act` is either a key of ACT2FN or already a callable.
+        activation = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[activation] if isinstance(
+            activation, str) else activation
+
+    def forward(self, hidden_states):
+        projected = self.dense(hidden_states)
+        return self.intermediate_act_fn(projected)
+
+
+class BertOutput(nn.Module):
+    """Project back to the hidden size, then dropout, residual and norm."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden = config.hidden_size
+        self.dense = nn.Linear(config.intermediate_size, hidden)
+        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        # `input_tensor` is the residual added before layer normalization.
+        reduced = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(reduced + input_tensor)
+
+
+class BertLayer(nn.Module):
+    """One transformer block: attention, optional cross-attention, FFN."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = BertAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(
+                    f'{self} should be used as a decoder model if cross attention is added'
+                )
+            self.crossattention = BertAttention(
+                config, position_embedding_type='absolute')
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Apply the block; decoders append their key/value cache last."""
+        has_past = past_key_value is not None
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if has_past else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+
+        if self.is_decoder:
+            # if decoder, the last output is tuple of self-attn cache
+            present_key_value = self_attention_outputs[-1]
+            outputs = self_attention_outputs[1:-1]
+        else:
+            # add self attentions if we output attention weights
+            outputs = self_attention_outputs[1:]
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, 'crossattention'):
+                raise ValueError(
+                    f'If `encoder_hidden_states` are passed, {self} has to be instantiated '
+                    f'with cross-attention layers by setting `config.add_cross_attention=True`'
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if has_past else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            # add cross attentions if we output attention weights
+            outputs = outputs + cross_attention_outputs[1:-1]
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        layer_output = apply_chunking_to_forward(self.feed_forward_chunk,
+                                                 self.chunk_size_feed_forward,
+                                                 self.seq_len_dim,
+                                                 attention_output)
+        outputs = (layer_output, ) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value, )
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        """Feed-forward sub-layer; called via apply_chunking_to_forward."""
+        return self.output(
+            self.intermediate(attention_output), attention_output)
+
+
+class BertEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` BertLayer modules."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [BertLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        """Run all layers in order and collect the requested extras.
+
+        Returns a BaseModelOutputWithPastAndCrossAttentions, or, when
+        `return_dict` is false, a tuple of its non-None fields.
+        """
+        collect_cross = output_attentions and self.config.add_cross_attention
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if collect_cross else None
+        next_decoder_cache = () if use_cache else None
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states, )
+
+            layer_head_mask = None if head_mask is None else head_mask[i]
+            past_key_value = (None if past_key_values is None else
+                              past_key_values[i])
+
+            if self.gradient_checkpointing and self.training:
+                if use_cache:
+                    logger.warning(
+                        '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    # The closure supplies the two arguments that are not
+                    # passed through torch.utils.checkpoint.checkpoint.
+                    def custom_forward(*inputs):
+                        return module(*inputs, past_key_value,
+                                      output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1], )
+            if output_attentions:
+                all_self_attentions += (layer_outputs[1], )
+                if collect_cross:
+                    all_cross_attentions += (layer_outputs[2], )
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states, )
+
+        if return_dict:
+            return BaseModelOutputWithPastAndCrossAttentions(
+                last_hidden_state=hidden_states,
+                past_key_values=next_decoder_cache,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attentions,
+                cross_attentions=all_cross_attentions,
+            )
+        candidates = (hidden_states, next_decoder_cache, all_hidden_states,
+                      all_self_attentions, all_cross_attentions)
+        return tuple(v for v in candidates if v is not None)
+
+
+class BertPooler(nn.Module):
+    """Pool a sequence by transforming the first token's hidden state."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        """Return tanh(dense(h[:, 0])) for hidden states `h`."""
+        # "Pooling" here means simply selecting index 0 along dim 1, i.e.
+        # the hidden state of the first token.
+        first_token = hidden_states[:, 0]
+        return self.activation(self.dense(first_token))
+
+
+class BertPredictionHeadTransform(nn.Module):
+    """Dense layer, activation and LayerNorm applied before the LM decoder."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        # `hidden_act` is either a key of ACT2FN or already a callable.
+        activation = config.hidden_act
+        self.transform_act_fn = ACT2FN[activation] if isinstance(
+            activation, str) else activation
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        # dense -> activation -> LayerNorm, without a residual connection
+        activated = self.transform_act_fn(self.dense(hidden_states))
+        return self.LayerNorm(activated)
+
+
+class BertLMPredictionHead(nn.Module):
+    """Map hidden states to vocabulary logits."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # The decoder is created without its own bias; a separate
+        # output-only bias with one entry per token is defined below.
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        """Apply the transform, then the decoder projection."""
+        transformed = self.transform(hidden_states)
+        return self.decoder(transformed)
+
+
+class BertOnlyMLMHead(nn.Module):
+    """Masked-LM head wrapping a single BertLMPredictionHead."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        return self.predictions(sequence_output)
+
+
+class BertOnlyNSPHead(nn.Module):
+    """Two-way linear classifier over the pooled output."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        return self.seq_relationship(pooled_output)
+
+
+class BertPreTrainingHeads(nn.Module):
+    """LM prediction head plus two-way sequence-relationship classifier."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        return (self.predictions(sequence_output),
+                self.seq_relationship(pooled_output))
+
+
+class BertPreTrainedModel(PreTrainedModel):
+    """
+    Abstract base for the BERT models: provides the weight initialization
+    and hooks into the PreTrainedModel interface for downloading and
+    loading pretrained models.
+    """
+
+    config_class = BertConfig
+    load_tf_weights = load_tf_weights_in_bert
+    base_model_prefix = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def _init_weights(self, module):
+        """Initialize the weights of a single sub-module in place."""
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Linear: slightly different from the TF version which uses
+            # truncated_normal, cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear):
+                if module.bias is not None:
+                    module.bias.data.zero_()
+            elif module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Toggle gradient checkpointing on BertEncoder modules only."""
+        if isinstance(module, BertEncoder):
+            module.gradient_checkpointing = value
+
+
+@dataclass
+class BertForPreTrainingOutput(ModelOutput):
+    """
+    Output type of [`BertForPreTraining`].
+
+    Args:
+        loss (*optional*, returned when both `labels` and `next_sentence_label`
+        are provided, `torch.FloatTensor` of shape `(1,)`):
+            Total loss as the sum of the masked language modeling loss and the
+            next sequence prediction (classification) loss.
+        prediction_logits (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each
+            vocabulary token before SoftMax).
+        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size,
+        2)`):
+            Prediction scores of the next sequence prediction (classification)
+            head (scores of True/False continuation before SoftMax).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when
+        `output_hidden_states=True` is passed or when
+        `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings +
+            one for the output of each layer) of shape `(batch_size,
+            sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the
+            initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when
+        `output_attentions=True` is passed or when
+        `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape
+            `(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_logits: torch.FloatTensor = None
+    seq_relationship_logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+BERT_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass
+    documentation for the generic methods the library implements for all its
+    model (such as downloading or saving, resizing the input embeddings, pruning
+    heads etc.)
+
+    This model is also a PyTorch
+    [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch
+    documentation for all matter related to general usage and behavior.
+
+    Parameters:
+        config ([`BertConfig`]): Model configuration class with all the
+        parameters of the model.
+            Initializing with a config file does not load the weights associated
+            with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model
+            weights.
+"""  # shared preamble handed to @add_start_docstrings on the model classes
+
+BERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`BertTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+            for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask
+            values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the
+            inputs. Indices are selected in `[0, 1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position
+            embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
+        num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask
+            values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`,
+        *optional*):
+            Optionally, instead of passing `input_ids` you can choose to
+            directly pass an embedded representation. This is useful if you want
+            more control over how to convert `input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention
+            layers. See `attentions` under returned tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See
+            `hidden_states` under returned tensors for more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a
+            plain tuple.
+"""  # every '{0}' is filled with the input-shape text via str.format()
+
+
+@add_start_docstrings(
+    'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.',
+    BERT_START_DOCSTRING,
+)
+class BertModel(BertPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a
+    decoder, in which case a layer of cross-attention is added between the
+    self-attention layers, following the architecture described in [Attention is
+    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam
+    Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+    Kaiser and Illia Polosukhin.
+
+    To behave as a decoder the model needs to be initialized with the
+    `is_decoder` argument of the configuration set to `True`. To be used in a
+    Seq2Seq model, the model needs to be initialized with both `is_decoder`
+    argument and `add_cross_attention` set to `True`; an `encoder_hidden_states`
+    is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+
+        self.pooler = BertPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @classmethod
+    def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config):
+        config = BertConfig(**config)  # model_dir is accepted but not used here
+        model = cls(config, add_pooling_layer)
+        return model
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                encoder_hidden_states=None,
+                encoder_attention_mask=None,
+                past_key_values=None,
+                use_cache=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):  # extra keyword arguments are accepted but never read
+        r"""
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the
+            encoder. Used in the cross-attention if the model is configured as a
+            decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of
+            the encoder input. This mask is used in the cross-attention if the
+            model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length
+        `config.n_layers` with each tuple having 4 tensors of shape
+        `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention
+            blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only
+            the last `decoder_input_ids` (those that don't have their past key
+            value states given to this model) of shape `(batch_size, 1)` instead
+            of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned
+            and can be used to speed up decoding (see `past_key_values`).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False  # caller's value is overridden outside decoder mode
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                'You cannot specify both input_ids and inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError(
+                'You have to specify either input_ids or inputs_embeds')
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # Length of the cached prefix (dim 2 of the first cached tensor), else 0
+        past_key_values_length = past_key_values[0][0].shape[
+            2] if past_key_values is not None else 0
+
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                ((batch_size, seq_length + past_key_values_length)),
+                device=device)
+
+        if token_type_ids is None:
+            if hasattr(self.embeddings, 'token_type_ids'):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :
+                                                                         seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
+                    batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(
+                    input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size(
+            )
+            encoder_hidden_shape = (encoder_batch_size,
+                                    encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(
+                    encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask,
+                                       self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(
+            sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    def extract_sequence_outputs(self, outputs):
+        return outputs['last_hidden_state']  # keyed lookup: a plain tuple will not work
+
+    def extract_pooled_outputs(self, outputs):
+        return outputs['pooler_output']  # keyed lookup: a plain tuple will not work
+
+
+@add_start_docstrings(
+    """
+    Bert Model with two heads on top as done during the pretraining: a `masked
+    language modeling` head and a `next sentence prediction (classification)`
+    head.
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForPreTraining(BertPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BertModel(config)
+        self.cls = BertPreTrainingHeads(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        next_sentence_label=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
+            *optional*):
+                Labels for computing the masked language modeling loss. Indices
+                should be in `[-100, 0, ..., config.vocab_size]` (see
+                `input_ids` docstring) Tokens with indices set to `-100` are
+                ignored (masked), the loss is only computed for the tokens with
+                labels in `[0, ..., config.vocab_size]`
+            next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`,
+            *optional*):
+                Labels for computing the next sequence prediction
+                (classification) loss. Input should be a sequence pair (see
+                `input_ids` docstring) Indices should be in `[0, 1]`:
+
+                - 0 indicates sequence B is a continuation of sequence A,
+                - 1 indicates sequence B is a random sequence.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import BertTokenizer, BertForPreTraining
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.prediction_logits
+        >>> seq_relationship_logits = outputs.seq_relationship_logits
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores, seq_relationship_score = self.cls(
+            sequence_output, pooled_output)
+
+        total_loss = None
+        # The joint loss needs both label tensors; with only one, no loss is computed
+        if labels is not None and next_sentence_label is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+            next_sentence_loss = loss_fct(
+                seq_relationship_score.view(-1, 2),
+                next_sentence_label.view(-1))
+            total_loss = masked_lm_loss + next_sentence_loss
+
+        if not return_dict:
+            output = (prediction_scores, seq_relationship_score) + outputs[2:]
+            return ((total_loss, )
+                    + output) if total_loss is not None else output
+
+        return BertForPreTrainingOutput(
+            loss=total_loss,
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """Bert Model with a `language modeling` head on top for CLM fine-tuning. """,
+    BERT_START_DOCSTRING)
+class BertLMHeadModel(BertPreTrainedModel):
+
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning(
+                'If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`'
+            )
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.cls = BertOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=CausalLMOutputWithCrossAttentions,
+        config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+            encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size,
+            sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the
+                encoder. Used in the cross-attention if the model is configured
+                as a decoder.
+            encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size,
+            sequence_length)`, *optional*):
+                Mask to avoid performing attention on the padding token indices
+                of the encoder input. This mask is used in the cross-attention
+                if the model is configured as a decoder. Mask values selected in
+                `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
+            *optional*):
+                Labels for computing the left-to-right language modeling loss
+                (next word prediction). Indices should be in
+                `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring).
+                Tokens with indices set to `-100` are ignored (masked); the loss
+                is only computed for tokens with labels in `[0, ..., config.vocab_size]`.
+            past_key_values (`tuple(tuple(torch.FloatTensor))` of length
+            `config.n_layers` with each tuple having 4 tensors of shape
+            `(batch_size, num_heads, sequence_length - 1,
+            embed_size_per_head)`):
+                Contains precomputed key and value hidden states of the
+                attention blocks. Can be used to speed up decoding.
+
+                If `past_key_values` are used, the user can optionally input
+                only the last `decoder_input_ids` (those that don't have their
+                past key value states given to this model) of shape
+                `(batch_size, 1)` instead of all `decoder_input_ids` of shape
+                `(batch_size, sequence_length)`.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are
+                returned and can be used to speed up decoding (see
+                `past_key_values`).
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> config = BertConfig.from_pretrained("bert-base-cased")
+        >>> config.is_decoder = True
+        >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False  # caching is switched off whenever labels are given
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            shifted_prediction_scores = prediction_scores[:, :
+                                                          -1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(
+                shifted_prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores, ) + outputs[2:]
+            return ((lm_loss, ) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      past=None,
+                                      attention_mask=None,
+                                      **model_kwargs):
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        # cut decoder_input_ids if past is used
+        if past is not None:
+            input_ids = input_ids[:, -1:]
+
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'past_key_values': past
+        }
+
+    def _reorder_cache(self, past, beam_idx):
+        reordered_past = ()
+        for layer_past in past:
+            reordered_past += (tuple(
+                past_state.index_select(0, beam_idx)
+                for past_state in layer_past), )
+        return reordered_past
+
+
+@add_start_docstrings(
+    """Bert Model with a `language modeling` head on top. """,
+    BERT_START_DOCSTRING)
+class BertForMaskedLM(BertPreTrainedModel):
+
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for '
+                'bi-directional self-attention.')
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.cls = BertOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
+        *optional*):
+            Labels for computing the masked language modeling loss. Indices
+            should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids`
+            docstring) Tokens with indices set to `-100` are ignored (masked),
+            the loss is only computed for the tokens with labels in `[0, ...,
+            config.vocab_size]`
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # default ignore_index is -100
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores, ) + outputs[2:]
+            return ((masked_lm_loss, )
+                    + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      attention_mask=None,
+                                      **model_kwargs):
+        """Append one attention-masked PAD token to every input sequence."""
+        if self.config.pad_token_id is None:
+            raise ValueError('The PAD token should be defined for generation')
+
+        # The default is None: attend to every token, as BertLMHeadModel does
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_ids.shape)
+        padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1))
+        attention_mask = torch.cat([attention_mask, padding_mask], dim=-1)
+        dummy_token = torch.full((input_ids.shape[0], 1),
+                                 self.config.pad_token_id,
+                                 dtype=torch.long,
+                                 device=input_ids.device)
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+        return {'input_ids': input_ids, 'attention_mask': attention_mask}
+
+
+@add_start_docstrings(
+    """Bert Model with a `next sentence prediction (classification)` head on top. """,
+    BERT_START_DOCSTRING,
+)
+class BertForNextSentencePrediction(BertPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BertModel(config)
+        self.cls = BertOnlyNSPHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        **kwargs,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the next sequence prediction (classification)
+            loss. Input should be a sequence pair (see `input_ids` docstring).
+            Indices should be in `[0, 1]`:
+
+            - 0 indicates sequence B is a continuation of sequence A,
+            - 1 indicates sequence B is a random sequence.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import BertTokenizer, BertForNextSentencePrediction
+        >>> import torch
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+
+        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+        >>> logits = outputs.logits
+        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+        ```
+        """
+
+        if 'next_sentence_label' in kwargs:
+            warnings.warn(
+                'The `next_sentence_label` argument is deprecated, use `labels` instead.',
+                FutureWarning,
+            )
+            labels = kwargs.pop('next_sentence_label')
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        seq_relationship_scores = self.cls(pooled_output)
+
+        next_sentence_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            next_sentence_loss = loss_fct(
+                seq_relationship_scores.view(-1, 2), labels.view(-1))
+
+        if not return_dict:
+            output = (seq_relationship_scores, ) + outputs[2:]
+            return ((next_sentence_loss, )
+                    + output) if next_sentence_loss is not None else output
+
+        return NextSentencePredictorOutput(
+            loss=next_sentence_loss,
+            logits=seq_relationship_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model transformer with a sequence classification/regression head on top
+    (a linear layer on top of the pooled output) e.g. for GLUE tasks.
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForSequenceClassification(BertPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.bert = BertModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None
+            else config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in `[0, ..., config.num_labels - 1]`. If
+            `config.num_labels == 1` a regression loss is computed (Mean-Square
+            loss), If `config.num_labels > 1` a classification loss is computed
+            (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = 'regression'
+                elif self.num_labels > 1 and (labels.dtype == torch.long
+                                              or labels.dtype == torch.int):
+                    self.config.problem_type = 'single_label_classification'
+                else:
+                    self.config.problem_type = 'multi_label_classification'
+
+            if self.config.problem_type == 'regression':
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == 'single_label_classification':
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == 'multi_label_classification':
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model with a multiple choice classification head on top (a linear layer
+    on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForMultipleChoice(BertPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BertModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None
+            else config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format(
+            'batch_size, num_choices, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss.
+            Indices should be in `[0, ..., num_choices-1]` where `num_choices`
+            is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[
+            1] if input_ids is not None else inputs_embeds.shape[1]
+
+        input_ids = input_ids.view(
+            -1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(
+            -1,
+            attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(
+            -1,
+            token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(
+            -1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2),
+                               inputs_embeds.size(-1))
+            if inputs_embeds is not None else None)
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForTokenClassification(BertPreTrainedModel):
+
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None
+            else config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
+        *optional*):
+            Labels for computing the token classification loss. Indices should
+            be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1),
+                    torch.tensor(loss_fct.ignore_index).type_as(labels))
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model with a span classification head on top for extractive
+    question-answering tasks like SQuAD (a linear layers on top of the
+    hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForQuestionAnswering(BertPreTrainedModel):
+
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`,
+        *optional*):
+            Labels for position (index) of the start of the labelled span for
+            computing the token classification loss. Positions are clamped to
+            the length of the sequence (`sequence_length`). Position outside of
+            the sequence are not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for
+            computing the token classification loss. Positions are clamped to
+            the length of the sequence (`sequence_length`). Position outside of
+            the sequence are not taken into account for computing the loss.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, the split adds a dimension; squeeze it
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss, )
+                    + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/modelscope/models/nlp/bert_for_sequence_classification.py b/modelscope/models/nlp/bert_for_sequence_classification.py
deleted file mode 100644
index 2b1a3b3b..00000000
--- a/modelscope/models/nlp/bert_for_sequence_classification.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os
-from typing import Any, Dict
-
-import json
-import numpy as np
-
-from modelscope.metainfo import Models
-from modelscope.models import TorchModel
-from modelscope.models.builder import MODELS
-from modelscope.utils.constant import Tasks
-
-__all__ = ['BertForSequenceClassification']
-
-
-@MODELS.register_module(Tasks.text_classification, module_name=Models.bert)
-class BertForSequenceClassification(TorchModel):
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        # Model.__init__(self, model_dir, model_cls, first_sequence, *args, **kwargs)
-        # Predictor.__init__(self, *args, **kwargs)
-        """initialize the sequence classification model from the `model_dir` path.
-
-        Args:
-            model_dir (str): the model path.
-        """
-
-        super().__init__(model_dir, *args, **kwargs)
-        import torch
-        from easynlp.appzoo import SequenceClassification
-        from easynlp.core.predictor import get_model_predictor
-        self.model = get_model_predictor(
-            model_dir=self.model_dir,
-            model_cls=SequenceClassification,
-            input_keys=[('input_ids', torch.LongTensor),
-                        ('attention_mask', torch.LongTensor),
-                        ('token_type_ids', torch.LongTensor)],
-            output_keys=['predictions', 'probabilities', 'logits'])
-
-        self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
-        with open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.id2label = {idx: name for name, idx in self.label_mapping.items()}
-
-    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
-        """return the result by the model
-
-        Args:
-            input (Dict[str, Any]): the preprocessed data
-
-        Returns:
-            Dict[str, np.ndarray]: results
-                Example:
-                    {
-                        'predictions': array([1]), # lable 0-negative 1-positive
-                        'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
-                        'logits': array([[-0.53860897,  1.5029076 ]], dtype=float32) # true value
-                    }
-        """
-        return self.model.predict(input)
-
-    def postprocess(self, inputs: Dict[str, np.ndarray],
-                    **kwargs) -> Dict[str, np.ndarray]:
-        # N x num_classes
-        probs = inputs['probabilities']
-        result = {
-            'probs': probs,
-        }
-
-        return result
diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py
index 664fc6c6..830210ed 100644
--- a/modelscope/models/nlp/deberta_v2/__init__.py
+++ b/modelscope/models/nlp/deberta_v2/__init__.py
@@ -21,21 +21,12 @@ from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
 
-_import_structure = {
-    'configuration_deberta_v2': [
-        'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config',
-        'DebertaV2OnnxConfig'
-    ],
-    'tokenization_deberta_v2': ['DebertaV2Tokenizer'],
-}
-
 if TYPE_CHECKING:
     from .configuration_deberta_v2 import DebertaV2Config
     from .tokenization_deberta_v2 import DebertaV2Tokenizer
     from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast
 
     from .modeling_deberta_v2 import (
-        DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
         DebertaV2ForMaskedLM,
         DebertaV2ForMultipleChoice,
         DebertaV2ForQuestionAnswering,
@@ -55,7 +46,6 @@ else:
         'DebertaV2TokenizerFast'
     ]
     _import_structure['modeling_deberta_v2'] = [
-        'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST',
         'DebertaV2ForMaskedLM',
         'DebertaV2ForMultipleChoice',
         'DebertaV2ForQuestionAnswering',
diff --git a/modelscope/models/nlp/heads/fill_mask_head.py b/modelscope/models/nlp/heads/fill_mask_head.py
new file mode 100644
index 00000000..6b0c5e05
--- /dev/null
+++ b/modelscope/models/nlp/heads/fill_mask_head.py
@@ -0,0 +1,101 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.activations import ACT2FN
+
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+
+@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm)
+class BertFillMaskHead(TorchHead):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.cls = BertOnlyMLMHead(self.config)
+
+    def forward(self, sequence_output):
+        return {OutputKeys.LOGITS: self.cls(sequence_output)}
+
+    def compute_loss(self, outputs: Dict[str, torch.Tensor],
+                     labels) -> Dict[str, torch.Tensor]:
+        if isinstance(outputs, dict):  # forward() returns {LOGITS: scores}
+            outputs = outputs[OutputKeys.LOGITS]
+        loss = CrossEntropyLoss()(  # labels equal to -100 are ignored
+            outputs.view(-1, self.config.vocab_size), labels.view(-1))
+        return {OutputKeys.LOSS: loss}
+
+
+class BertPredictionHeadTransform(nn.Module):
+    """Apply dense -> activation -> LayerNorm to the hidden states."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        # `hidden_act` is either an ACT2FN key or the callable itself.
+        act = config.hidden_act
+        self.transform_act_fn = ACT2FN[act] if isinstance(act, str) else act
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        # Run the three stages in the order they were constructed.
+        projected = self.dense(hidden_states)
+        activated = self.transform_act_fn(projected)
+        return self.LayerNorm(activated)
+
+
+class BertLMPredictionHead(nn.Module):
+    """Transform hidden states, then project them onto the vocabulary."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # The decoder is built without its own bias; a separate per-token
+        # bias parameter is created below and attached to it.
+        vocab_size = config.vocab_size
+        self.decoder = nn.Linear(config.hidden_size, vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(vocab_size))
+
+        # Share one Parameter between `self.bias` and `self.decoder.bias` so
+        # that `resize_token_embeddings` resizes the bias correctly.
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        transformed = self.transform(hidden_states)
+        return self.decoder(transformed)
+
+
+class BertOnlyMLMHead(nn.Module):
+    """Thin wrapper exposing `BertLMPredictionHead` as `predictions`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+
+    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+        return self.predictions(sequence_output)
diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py
index fb54637b..e477533f 100644
--- a/modelscope/models/nlp/heads/torch_pretrain_head.py
+++ b/modelscope/models/nlp/heads/torch_pretrain_head.py
@@ -11,7 +11,7 @@ from modelscope.models.builder import HEADS
 from modelscope.utils.constant import Tasks
 
 
-@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm)
+# NOTE: (Tasks.fill_mask, Heads.bert_mlm) is now registered by BertFillMaskHead in fill_mask_head.py
 class BertMLMHead(BertOnlyMLMHead, TorchHead):
 
     def compute_loss(self, outputs: Dict[str, torch.Tensor],
diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py
index 514a04cd..b7a890c1 100644
--- a/modelscope/models/nlp/masked_language.py
+++ b/modelscope/models/nlp/masked_language.py
@@ -1,10 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from transformers import BertForMaskedLM as BertForMaskedLMTransformer
-
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import \
+    BertForMaskedLM as BertForMaskedLMTransformer
 from modelscope.models.nlp.deberta_v2 import \
     DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer
 from modelscope.models.nlp.structbert import SbertForMaskedLM
diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
index 62198ed2..8b0c59b2 100644
--- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
+++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
@@ -41,12 +41,9 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel):
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
         input_tensor = {
-            'input_ids':
-            torch.tensor(input['input_ids']).unsqueeze(0),
-            'attention_mask':
-            torch.tensor(input['attention_mask']).unsqueeze(0),
-            'label_mask':
-            torch.tensor(input['label_mask'], dtype=torch.bool).unsqueeze(0)
+            'input_ids': input['input_ids'],
+            'attention_mask': input['attention_mask'],
+            'label_mask': input['label_mask'],
         }
         output = {
             'text': input['text'],
diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py
index a8930e68..156c615c 100644
--- a/modelscope/models/nlp/sequence_classification.py
+++ b/modelscope/models/nlp/sequence_classification.py
@@ -7,6 +7,7 @@ from torch import nn
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertPreTrainedModel
 from modelscope.models.nlp.structbert import SbertPreTrainedModel
 from modelscope.models.nlp.veco import \
     VecoForSequenceClassification as VecoForSequenceClassificationTransform
@@ -16,7 +17,10 @@ from modelscope.utils.hub import parse_label_mapping
 from modelscope.utils.tensor_utils import (torch_nested_detach,
                                            torch_nested_numpify)
 
-__all__ = ['SbertForSequenceClassification', 'VecoForSequenceClassification']
+__all__ = [
+    'SbertForSequenceClassification', 'VecoForSequenceClassification',
+    'BertForSequenceClassification'
+]
 
 
 class SequenceClassificationBase(TorchModel):
@@ -132,7 +136,7 @@ class SbertForSequenceClassification(SequenceClassificationBase,
             label2id = parse_label_mapping(model_dir)
             if label2id is not None and len(label2id) > 0:
                 num_labels = len(label2id)
-
+            cls.id2label = {id: label for label, id in label2id.items()}
         model_args = {} if num_labels is None else {'num_labels': num_labels}
         return super(SbertPreTrainedModel,
                      SbertForSequenceClassification).from_pretrained(
@@ -206,3 +210,78 @@ class VecoForSequenceClassification(TorchModel,
                          pretrained_model_name_or_path=kwargs.get('model_dir'),
                          model_dir=kwargs.get('model_dir'),
                          **model_args)
+
+
+@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert)
+@MODELS.register_module(
+    Tasks.sentiment_classification, module_name=Models.bert)
+@MODELS.register_module(Tasks.nli, module_name=Models.bert)
+@MODELS.register_module(Tasks.text_classification, module_name=Models.bert)
+class BertForSequenceClassification(SequenceClassificationBase,
+                                    BertPreTrainedModel):
+    """Bert sequence classification model.
+
+        Inherited from SequenceClassificationBase.
+    """
+    base_model_prefix: str = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, model_dir):
+        if hasattr(config, 'base_model_prefix'):
+            BertForSequenceClassification.base_model_prefix = config.base_model_prefix
+        super().__init__(config, model_dir)
+
+    def build_base_model(self):
+        from .bert import BertModel
+        return BertModel(self.config, add_pooling_layer=True)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict)
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        """Instantiate the model.
+
+        @param kwargs: Input args.
+                    model_dir: The model dir used to load the checkpoint and the label information.
+                    num_labels: An optional arg to tell the model how many classes to initialize.
+                                    Method will call utils.parse_label_mapping if num_labels not supplied.
+                                    If num_labels is not found, the model will use the default setting (2 classes).
+        @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
+        """
+
+        model_dir = kwargs.get('model_dir')
+        num_labels = kwargs.get('num_labels')
+        if num_labels is None:
+            label2id = parse_label_mapping(model_dir)
+            if label2id is not None and len(label2id) > 0:
+                num_labels = len(label2id)
+
+        model_args = {} if num_labels is None else {'num_labels': num_labels}
+        return super(BertPreTrainedModel,
+                     BertForSequenceClassification).from_pretrained(
+                         pretrained_model_name_or_path=kwargs.get('model_dir'),
+                         model_dir=kwargs.get('model_dir'),
+                         **model_args)
diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py
index 7493ba74..90f22aa1 100644
--- a/modelscope/models/nlp/task_models/__init__.py
+++ b/modelscope/models/nlp/task_models/__init__.py
@@ -5,6 +5,8 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .information_extraction import InformationExtractionModel
+    from .feature_extraction import FeatureExtractionModel
+    from .fill_mask import FillMaskModel
     from .sequence_classification import SequenceClassificationModel
     from .task_model import SingleBackboneTaskModelBase
     from .token_classification import TokenClassificationModel
@@ -12,6 +14,8 @@ if TYPE_CHECKING:
 else:
     _import_structure = {
         'information_extraction': ['InformationExtractionModel'],
+        'feature_extraction': ['FeatureExtractionModel'],
+        'fill_mask': ['FillMaskModel'],
         'sequence_classification': ['SequenceClassificationModel'],
         'task_model': ['SingleBackboneTaskModelBase'],
         'token_classification': ['TokenClassificationModel'],
diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py
new file mode 100644
index 00000000..069c37aa
--- /dev/null
+++ b/modelscope/models/nlp/task_models/feature_extraction.py
@@ -0,0 +1,43 @@
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FeatureExtractionModel']
+
+
+@MODELS.register_module(
+    Tasks.feature_extraction, module_name=TaskModels.feature_extraction)
+class FeatureExtractionModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the feature extraction model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        if 'base_model_prefix' in kwargs:
+            self._base_model_prefix = kwargs['base_model_prefix']
+
+        self.build_backbone(self.backbone_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+
+        # the backbone does not take labels, so strip them before the forward pass
+        labels = input.pop(OutputKeys.LABELS, None)
+
+        outputs = super().forward(input)
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        if labels is not None:
+            input[OutputKeys.LABELS] = labels
+
+        return {OutputKeys.TEXT_EMBEDDING: sequence_output}
diff --git a/modelscope/models/nlp/task_models/fill_mask.py b/modelscope/models/nlp/task_models/fill_mask.py
new file mode 100644
index 00000000..f7ef1cc2
--- /dev/null
+++ b/modelscope/models/nlp/task_models/fill_mask.py
@@ -0,0 +1,47 @@
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FillMaskModel']
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=TaskModels.fill_mask)
+class FillMaskModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the fill mask model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        if 'base_model_prefix' in kwargs:
+            self._base_model_prefix = kwargs['base_model_prefix']
+
+        self.build_backbone(self.backbone_cfg)
+        self.build_head(self.head_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+
+        # the backbone does not need labels; only the head uses them to compute the loss
+        labels = input.pop(OutputKeys.LABELS, None)
+
+        outputs = super().forward(input)
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        outputs = self.head.forward(sequence_output)
+
+        if labels is not None:
+            input[OutputKeys.LABELS] = labels
+            loss = self.compute_loss(outputs, labels)
+            outputs.update(loss)
+        outputs[OutputKeys.INPUT_IDS] = input[OutputKeys.INPUT_IDS]
+        return outputs
diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py
index 4792d07c..0a7d5a47 100644
--- a/modelscope/models/nlp/task_models/information_extraction.py
+++ b/modelscope/models/nlp/task_models/information_extraction.py
@@ -26,21 +26,12 @@ class InformationExtractionModel(SingleBackboneTaskModelBase):
         """
         super().__init__(model_dir, *args, **kwargs)
 
-        backbone_cfg = self.cfg.backbone
-        head_cfg = self.cfg.head
-        self.build_backbone(backbone_cfg)
-        self.build_head(head_cfg)
+        self.build_backbone(self.backbone_cfg)
+        self.build_head(self.head_cfg)
 
-    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
         outputs = super().forward(input)
         sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
         outputs = self.head.forward(sequence_output, input['text'],
                                     input['offsets'])
         return {OutputKeys.SPO_LIST: outputs}
-
-    def extract_backbone_outputs(self, outputs):
-        sequence_output = None
-        pooled_output = None
-        if hasattr(self.backbone, 'extract_sequence_outputs'):
-            sequence_output = self.backbone.extract_sequence_outputs(outputs)
-        return sequence_output, pooled_output
diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py
index 43a96327..1f5e46c3 100644
--- a/modelscope/models/nlp/task_models/sequence_classification.py
+++ b/modelscope/models/nlp/task_models/sequence_classification.py
@@ -11,10 +11,14 @@ from modelscope.models.nlp.task_models.task_model import \
     SingleBackboneTaskModelBase
 from modelscope.outputs import OutputKeys
 from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
 
 __all__ = ['SequenceClassificationModel']
 
 
+@MODELS.register_module(
+    Tasks.sentence_similarity, module_name=TaskModels.text_classification)
+@MODELS.register_module(Tasks.nli, module_name=TaskModels.text_classification)
 @MODELS.register_module(
     Tasks.sentiment_classification, module_name=TaskModels.text_classification)
 @MODELS.register_module(
@@ -31,49 +35,36 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase):
         if 'base_model_prefix' in kwargs:
             self._base_model_prefix = kwargs['base_model_prefix']
 
-        backbone_cfg = self.cfg.backbone
-        head_cfg = self.cfg.head
-
         # get the num_labels from label_mapping.json
         self.id2label = {}
-        self.label_path = os.path.join(model_dir, 'label_mapping.json')
-        if os.path.exists(self.label_path):
-            with open(self.label_path) as f:
-                self.label_mapping = json.load(f)
-            self.id2label = {
-                idx: name
-                for name, idx in self.label_mapping.items()
-            }
-        head_cfg['num_labels'] = len(self.label_mapping)
+        # get the num_labels
+        num_labels = kwargs.get('num_labels')
+        if num_labels is None:
+            label2id = parse_label_mapping(model_dir)
+            if label2id is not None and len(label2id) > 0:
+                num_labels = len(label2id)
+            self.id2label = {id: label for label, id in (label2id or {}).items()}  # label2id may be None
+        self.head_cfg['num_labels'] = num_labels
 
-        self.build_backbone(backbone_cfg)
-        self.build_head(head_cfg)
+        self.build_backbone(self.backbone_cfg)
+        self.build_head(self.head_cfg)
 
     def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        # the backbone does not need labels; only the head uses them to compute the loss
+        labels = input.pop(OutputKeys.LABELS, None)
+
         outputs = super().forward(input)
         sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
         outputs = self.head.forward(pooled_output)
-        if 'labels' in input:
-            loss = self.compute_loss(outputs, input['labels'])
+        if labels is not None:
+            input[OutputKeys.LABELS] = labels
+            loss = self.compute_loss(outputs, labels)
             outputs.update(loss)
         return outputs
 
     def extract_logits(self, outputs):
         return outputs[OutputKeys.LOGITS].cpu().detach()
 
-    def extract_backbone_outputs(self, outputs):
-        sequence_output = None
-        pooled_output = None
-        if hasattr(self.backbone, 'extract_sequence_outputs'):
-            sequence_output = self.backbone.extract_sequence_outputs(outputs)
-        if hasattr(self.backbone, 'extract_pooled_outputs'):
-            pooled_output = self.backbone.extract_pooled_outputs(outputs)
-        return sequence_output, pooled_output
-
-    def compute_loss(self, outputs, labels):
-        loss = self.head.compute_loss(outputs, labels)
-        return loss
-
     def postprocess(self, input, **kwargs):
         logits = self.extract_logits(input)
         probs = logits.softmax(-1).numpy()
diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py
index e93dd5f6..0b43044f 100644
--- a/modelscope/models/nlp/task_models/task_model.py
+++ b/modelscope/models/nlp/task_models/task_model.py
@@ -74,7 +74,7 @@ class BaseTaskModel(TorchModel, ABC):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
-        self.cfg = ConfigDict(kwargs)
+        self.config = ConfigDict(kwargs)
 
     def __repr__(self):
         # only log backbone and head name
@@ -397,6 +397,9 @@ class SingleBackboneTaskModelBase(BaseTaskModel):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
+        self.backbone_cfg = self.config.get('backbone', None)
+        assert self.backbone_cfg is not None
+        self.head_cfg = self.config.get('head', None)
 
     def build_backbone(self, cfg):
         if 'prefix' in cfg:
@@ -405,9 +408,13 @@ class SingleBackboneTaskModelBase(BaseTaskModel):
         setattr(self, cfg['prefix'], backbone)
 
     def build_head(self, cfg):
+        if cfg is None:
+            raise ValueError(
+                'Head config is missing, check if this was a backbone-only model'
+            )
         if 'prefix' in cfg:
             self._head_prefix = cfg['prefix']
-        head = build_head(cfg)
+        head = build_head(cfg, group_key=self.group_key)
         setattr(self, self._head_prefix, head)
         return head
 
@@ -431,8 +438,18 @@ class SingleBackboneTaskModelBase(BaseTaskModel):
             outputs = self.backbone.forward(**input)
         return outputs
 
-    def compute_loss(self, outputs: Dict[str, Any], labels):
-        raise NotImplementedError()
+    def compute_loss(self, outputs, labels):
+        loss = self.head.compute_loss(outputs, labels)
+        return loss
+
+    def extract_backbone_outputs(self, outputs):
+        sequence_output = None
+        pooled_output = None
+        if hasattr(self.backbone, 'extract_sequence_outputs'):
+            sequence_output = self.backbone.extract_sequence_outputs(outputs)
+        if hasattr(self.backbone, 'extract_pooled_outputs'):
+            pooled_output = self.backbone.extract_pooled_outputs(outputs)
+        return sequence_output, pooled_output
 
 
 class EncoderDecoderTaskModelBase(BaseTaskModel):
@@ -453,7 +470,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel):
 
     def build_encoder(self):
         encoder = build_backbone(
-            self.cfg,
+            self.config,
             type_name=self._encoder_key_in_cfg,
             task_name=Tasks.backbone)
         setattr(self, self._encoder_prefix, encoder)
@@ -461,7 +478,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel):
 
     def build_decoder(self):
         decoder = build_backbone(
-            self.cfg,
+            self.config,
             type_name=self._decoder_key_in_cfg,
             task_name=Tasks.backbone)
         setattr(self, self._decoder_prefix, decoder)
diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py
index 5c22098f..f3930182 100644
--- a/modelscope/models/nlp/task_models/token_classification.py
+++ b/modelscope/models/nlp/task_models/token_classification.py
@@ -31,9 +31,6 @@ class TokenClassificationModel(SingleBackboneTaskModelBase):
         if 'base_model_prefix' in kwargs:
             self._base_model_prefix = kwargs['base_model_prefix']
 
-        backbone_cfg = self.cfg.backbone
-        head_cfg = self.cfg.head
-
         # get the num_labels
         num_labels = kwargs.get('num_labels')
         if num_labels is None:
@@ -41,12 +38,12 @@ class TokenClassificationModel(SingleBackboneTaskModelBase):
             if label2id is not None and len(label2id) > 0:
                 num_labels = len(label2id)
             self.id2label = {id: label for label, id in label2id.items()}
-        head_cfg['num_labels'] = num_labels
+        self.head_cfg['num_labels'] = num_labels
 
-        self.build_backbone(backbone_cfg)
-        self.build_head(head_cfg)
+        self.build_backbone(self.backbone_cfg)
+        self.build_head(self.head_cfg)
 
-    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
         labels = None
         if OutputKeys.LABEL in input:
             labels = input.pop(OutputKeys.LABEL)
@@ -71,10 +68,6 @@ class TokenClassificationModel(SingleBackboneTaskModelBase):
             sequence_output = self.backbone.extract_sequence_outputs(outputs)
         return sequence_output, pooled_output
 
-    def compute_loss(self, outputs, labels):
-        loss = self.head.compute_loss(outputs, labels)
-        return loss
-
     def postprocess(self, input, **kwargs):
         logits = self.extract_logits(input)
         pred = torch.argmax(logits[0], dim=-1)
diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py
index c3723a61..c63e8037 100644
--- a/modelscope/models/nlp/token_classification.py
+++ b/modelscope/models/nlp/token_classification.py
@@ -10,12 +10,13 @@ from torch import nn
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertPreTrainedModel
+from modelscope.models.nlp.structbert import SbertPreTrainedModel
 from modelscope.outputs import OutputKeys
 from modelscope.utils.constant import Tasks
 from modelscope.utils.hub import parse_label_mapping
 from modelscope.utils.tensor_utils import (torch_nested_detach,
                                            torch_nested_numpify)
-from .structbert import SbertPreTrainedModel
 
 __all__ = ['SbertForTokenClassification']
 
@@ -171,3 +172,49 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel):
                          pretrained_model_name_or_path=kwargs.get('model_dir'),
                          model_dir=kwargs.get('model_dir'),
                          **model_args)
+
+
+@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert)
+@MODELS.register_module(Tasks.token_classification, module_name=Models.bert)
+class BertForSequenceClassification(TokenClassification, BertPreTrainedModel):
+    """Bert token classification model.
+
+        Inherited from TokenClassification. NOTE(review): class name looks copy-pasted, confirm.
+    """
+    base_model_prefix: str = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, model_dir):
+        if hasattr(config, 'base_model_prefix'):
+            BertForSequenceClassification.base_model_prefix = config.base_model_prefix
+        super().__init__(config, model_dir)
+
+    def build_base_model(self):
+        from .bert import BertModel
+        return BertModel(self.config, add_pooling_layer=True)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            **kwargs)
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index ce9e8d07..357afd07 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -417,6 +417,22 @@ TASK_OUTPUTS = {
     # }
     Tasks.fill_mask: [OutputKeys.TEXT],
 
+    # feature extraction result for single sample
+    # {
+    #   "text_embedding": [[
+    #     [1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04],
+    #     [6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01],
+    #     [2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05]
+    #   ],
+    #   [
+    #     [2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05],
+    #     [8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05],
+    #     [3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05]
+    #   ]
+    # ]
+    # }
+    Tasks.feature_extraction: [OutputKeys.TEXT_EMBEDDING],
+
     # (Deprecated) dialog intent prediction result for single sample
     # {'output': {'prediction': array([2.62349960e-03, 4.12110658e-03, 4.12748595e-05, 3.77560973e-05,
     #        1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04,
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 51d50d51..4f6873b0 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -52,8 +52,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                                    'damo/cv_vit_object-detection_coco'),
     Tasks.image_denoising: (Pipelines.image_denoise,
                             'damo/cv_nafnet_image-denoise_sidd'),
-    Tasks.text_classification: (Pipelines.sentiment_analysis,
-                                'damo/bert-base-sst2'),
+    Tasks.text_classification:
+    (Pipelines.sentiment_classification,
+     'damo/nlp_structbert_sentiment-classification_chinese-base'),
     Tasks.text_generation: (Pipelines.text_generation,
                             'damo/nlp_palm2.0_text-generation_chinese-base'),
     Tasks.zero_shot_classification:
@@ -80,6 +81,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.ocr_detection: (Pipelines.ocr_detection,
                           'damo/cv_resnet18_ocr-detection-line-level_damo'),
     Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
+    Tasks.feature_extraction: (Pipelines.feature_extraction,
+                               'damo/pert_feature-extraction_base-test'),
     Tasks.action_recognition: (Pipelines.action_recognition,
                                'damo/cv_TAdaConv_action-recognition'),
     Tasks.action_detection: (Pipelines.action_detection,
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index a8edc21a..5267b5b2 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -11,12 +11,13 @@ if TYPE_CHECKING:
     from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline
     from .document_segmentation_pipeline import DocumentSegmentationPipeline
     from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline
+    from .feature_extraction_pipeline import FeatureExtractionPipeline
     from .fill_mask_pipeline import FillMaskPipeline
     from .fill_mask_ponet_pipeline import FillMaskPonetPipeline
     from .information_extraction_pipeline import InformationExtractionPipeline
     from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline
-    from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline
-    from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline
+    from .passage_ranking_pipeline import PassageRankingPipeline
+    from .sentence_embedding_pipeline import SentenceEmbeddingPipeline
     from .sequence_classification_pipeline import SequenceClassificationPipeline
     from .summarization_pipeline import SummarizationPipeline
     from .text_classification_pipeline import TextClassificationPipeline
@@ -27,8 +28,7 @@ if TYPE_CHECKING:
     from .translation_pipeline import TranslationPipeline
     from .word_segmentation_pipeline import WordSegmentationPipeline
     from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
-    from .passage_ranking_pipeline import PassageRankingPipeline
-    from .sentence_embedding_pipeline import SentenceEmbeddingPipeline
+
 else:
     _import_structure = {
         'conversational_text_to_sql_pipeline':
@@ -41,16 +41,15 @@ else:
         'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'],
         'document_segmentation_pipeline': ['DocumentSegmentationPipeline'],
         'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'],
+        'feature_extraction_pipeline': ['FeatureExtractionPipeline'],
         'fill_mask_pipeline': ['FillMaskPipeline'],
         'fill_mask_ponet_pipeline': ['FillMaskPoNetPipeline'],
+        'information_extraction_pipeline': ['InformationExtractionPipeline'],
         'named_entity_recognition_pipeline':
         ['NamedEntityRecognitionPipeline'],
-        'information_extraction_pipeline': ['InformationExtractionPipeline'],
-        'pair_sentence_classification_pipeline':
-        ['PairSentenceClassificationPipeline'],
+        'passage_ranking_pipeline': ['PassageRankingPipeline'],
+        'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'],
         'sequence_classification_pipeline': ['SequenceClassificationPipeline'],
-        'single_sentence_classification_pipeline':
-        ['SingleSentenceClassificationPipeline'],
         'summarization_pipeline': ['SummarizationPipeline'],
         'text_classification_pipeline': ['TextClassificationPipeline'],
         'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'],
@@ -61,8 +60,6 @@ else:
         'word_segmentation_pipeline': ['WordSegmentationPipeline'],
         'zero_shot_classification_pipeline':
         ['ZeroShotClassificationPipeline'],
-        'passage_ranking_pipeline': ['PassageRankingPipeline'],
-        'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline']
     }
 
     import sys
diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py b/modelscope/pipelines/nlp/feature_extraction_pipeline.py
new file mode 100644
index 00000000..3af0c28d
--- /dev/null
+++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py
@@ -0,0 +1,82 @@
+import os
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import NLPPreprocessor, Preprocessor
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+
+__all__ = ['FeatureExtractionPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.feature_extraction, module_name=Pipelines.feature_extraction)
+class FeatureExtractionPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 first_sequence='sentence',
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a nlp feature extraction pipeline for prediction
+
+        Args:
+            model (str or Model): Either a local model dir that supports the feature extraction task, a
+            no-head model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance; if supplied, make sure it fits
+            the model.
+            first_sequence: The key to read the sentence in.
+            sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value.
+
+            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
+            param will have no effect.
+
+            Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipe_ins = pipeline('feature_extraction', model='damo/nlp_structbert_feature-extraction_english-large')
+            >>> input = 'Everything you love is treasure'
+            >>> print(pipe_ins(input))
+
+
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+
+        if preprocessor is None:
+            preprocessor = NLPPreprocessor(
+                model.model_dir,
+                padding=kwargs.pop('padding', False),
+                sequence_length=kwargs.pop('sequence_length', 128))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+        self.preprocessor = preprocessor
+        self.config = Config.from_file(
+            os.path.join(model.model_dir, ModelFile.CONFIGURATION))
+        self.tokenizer = preprocessor.tokenizer
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return self.model(**inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """process the prediction results
+
+        Args:
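+            inputs (Dict[str, Tensor]): the model outputs; must contain `OutputKeys.TEXT_EMBEDDING`
+
+        Returns:
+            Dict[str, Any]: the embedding under `OutputKeys.TEXT_EMBEDDING`, converted to nested lists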
+        """
+
+        return {
+            OutputKeys.TEXT_EMBEDDING:
+            inputs[OutputKeys.TEXT_EMBEDDING].tolist()
+        }
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index 12f4b80f..3d515e2d 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -10,7 +10,7 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import FillMaskPreprocessor, Preprocessor
+from modelscope.preprocessors import NLPPreprocessor, Preprocessor
 from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile, Tasks
 
@@ -57,7 +57,7 @@ class FillMaskPipeline(Pipeline):
             model, Model) else Model.from_pretrained(model)
 
         if preprocessor is None:
-            preprocessor = FillMaskPreprocessor(
+            preprocessor = NLPPreprocessor(
                 fill_mask_model.model_dir,
                 first_sequence=first_sequence,
                 second_sequence=None,
@@ -118,7 +118,10 @@ class FillMaskPipeline(Pipeline):
         logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy()
         input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy()
         pred_ids = np.argmax(logits, axis=-1)
-        model_type = self.model.config.model_type
+        if hasattr(self.model.config, 'backbone'):
+            model_type = self.model.config.backbone.type
+        else:
+            model_type = self.model.config.model_type
         process_type = model_type if model_type in self.mask_id else _type_map[
             model_type]
         rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids,
diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py
index 07223d07..763e941c 100644
--- a/modelscope/pipelines/nlp/information_extraction_pipeline.py
+++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py
@@ -36,7 +36,7 @@ class InformationExtractionPipeline(Pipeline):
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
         with torch.no_grad():
-            return super().forward(inputs, **forward_params)
+            return self.model(**inputs, **forward_params)
 
     def postprocess(self, inputs: Dict[str, Any],
                     **postprocess_params) -> Dict[str, str]:
diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
index 467d7aba..7275feca 100644
--- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
+++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
@@ -9,7 +9,8 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import NERPreprocessor, Preprocessor
+from modelscope.preprocessors import (Preprocessor,
+                                      TokenClassificationPreprocessor)
 from modelscope.utils.constant import Tasks
 
 __all__ = ['NamedEntityRecognitionPipeline']
@@ -46,7 +47,7 @@ class NamedEntityRecognitionPipeline(Pipeline):
         model = model if isinstance(model,
                                     Model) else Model.from_pretrained(model)
         if preprocessor is None:
-            preprocessor = NERPreprocessor(
+            preprocessor = TokenClassificationPreprocessor(
                 model.model_dir,
                 sequence_length=kwargs.pop('sequence_length', 512))
         model.eval()
diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
deleted file mode 100644
index bdb75c73..00000000
--- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from typing import Union
-
-from modelscope.models.base import Model
-from ...metainfo import Pipelines
-from ...preprocessors import (PairSentenceClassificationPreprocessor,
-                              Preprocessor)
-from ...utils.constant import Tasks
-from ..builder import PIPELINES
-from .sequence_classification_pipeline_base import \
-    SequenceClassificationPipelineBase
-
-__all__ = ['PairSentenceClassificationPipeline']
-
-
-@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli)
-@PIPELINES.register_module(
-    Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity)
-class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase):
-
-    def __init__(self,
-                 model: Union[Model, str],
-                 preprocessor: Preprocessor = None,
-                 first_sequence='first_sequence',
-                 second_sequence='second_sequence',
-                 **kwargs):
-        """Use `model` and `preprocessor` to create a nlp pair sequence classification pipeline for prediction.
-
-        Args:
-            model (str or Model): Supply either a local model dir which supported the sequence classification task,
-            or a model id from the model hub, or a torch model instance.
-            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
-            the model if supplied.
-            first_sequence: The key to read the first sentence in.
-            second_sequence: The key to read the second sentence in.
-            sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value.
-
-            NOTE: Inputs of type 'tuple' or 'list' are also supported. In this scenario, the 'first_sequence' and
-            'second_sequence' param will have no effect.
-
-            Example:
-            >>> from modelscope.pipelines import pipeline
-            >>> pipeline_ins = pipeline(task='nli', model='damo/nlp_structbert_nli_chinese-base')
-            >>> sentence1 = '四川商务职业学院和四川财经职业学院哪个好？'
-            >>> sentence2 = '四川商务职业学院商务管理在哪个校区？'
-            >>> print(pipeline_ins((sentence1, sentence2)))
-            >>> # Or use the dict input:
-            >>> print(pipeline_ins({'first_sequence': sentence1, 'second_sequence': sentence2}))
-
-            To view other examples plese check the tests/pipelines/test_nli.py.
-        """
-        if preprocessor is None:
-            preprocessor = PairSentenceClassificationPreprocessor(
-                model.model_dir if isinstance(model, Model) else model,
-                first_sequence=first_sequence,
-                second_sequence=second_sequence,
-                sequence_length=kwargs.pop('sequence_length', 512))
-        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py
index 7fe8aace..8d0e1dcd 100644
--- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py
@@ -1,48 +1,64 @@
 from typing import Any, Dict, Union
 
 import numpy as np
+import torch
 
 from modelscope.metainfo import Pipelines
-from modelscope.models import Model
-from modelscope.models.nlp import BertForSequenceClassification
+from modelscope.models.base import Model
 from modelscope.outputs import OutputKeys
-from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import SequenceClassificationPreprocessor
+from modelscope.preprocessors import (Preprocessor,
+                                      SequenceClassificationPreprocessor)
 from modelscope.utils.constant import Tasks
 
-__all__ = ['SequenceClassificationPipeline']
-
 
 @PIPELINES.register_module(
     Tasks.text_classification, module_name=Pipelines.sentiment_analysis)
+@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli)
+@PIPELINES.register_module(
+    Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity)
+@PIPELINES.register_module(
+    Tasks.text_classification, module_name=Pipelines.sentiment_classification)
 class SequenceClassificationPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[BertForSequenceClassification, str],
-                 preprocessor: SequenceClassificationPreprocessor = None,
+                 model: Union[Model, str],
+                 preprocessor: Preprocessor = None,
                  **kwargs):
-        """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
+        """The pipeline shared by all the sequence classification sub-tasks.
 
         Args:
-            model (BertForSequenceClassification): a model instance
-            preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
+            model (str or Model): A model instance or a model local dir or a model id in the model hub.
+            preprocessor (Preprocessor): An optional preprocessor, built from the model dir if None.
         """
-        assert isinstance(model, str) or isinstance(model, BertForSequenceClassification), \
-            'model must be a single str or BertForSequenceClassification'
-        sc_model = model if isinstance(
-            model,
-            BertForSequenceClassification) else Model.from_pretrained(model)
+        assert isinstance(model, str) or isinstance(model, Model), \
+            'model must be a single str or Model'
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        first_sequence = kwargs.pop('first_sequence', 'first_sequence')
+        second_sequence = kwargs.pop('second_sequence', None)
+
         if preprocessor is None:
             preprocessor = SequenceClassificationPreprocessor(
-                sc_model.model_dir,
-                first_sequence='sentence',
-                second_sequence=None,
+                model.model_dir if isinstance(model, Model) else model,
+                first_sequence=first_sequence,
+                second_sequence=second_sequence,
                 sequence_length=kwargs.pop('sequence_length', 512))
-        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
 
-        assert hasattr(self.model, 'id2label'), \
-            'id2label map should be initalizaed in init function.'
+        assert preprocessor is not None
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.id2label = kwargs.get('id2label')
+        if self.id2label is None and hasattr(self.preprocessor, 'id2label'):
+            self.id2label = self.preprocessor.id2label
+        assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \
+                                          'as a parameter or make sure the preprocessor has the attribute.'
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return self.model(**inputs, **forward_params)
 
     def postprocess(self,
                     inputs: Dict[str, Any],
@@ -50,20 +66,18 @@ class SequenceClassificationPipeline(Pipeline):
         """process the prediction results
 
         Args:
-            inputs (Dict[str, Any]): input data dict
-            topk (int): return topk classification result.
-
+            inputs (Dict[str, Any]): the model outputs holding OutputKeys.PROBABILITIES
+            topk (int): The topk probs to take
         Returns:
             Dict[str, str]: the prediction results
         """
-        # NxC np.ndarray
-        probs = inputs['probs'][0]
+
+        probs = inputs[OutputKeys.PROBABILITIES][0]
         num_classes = probs.shape[0]
         topk = min(topk, num_classes)
         top_indices = np.argpartition(probs, -topk)[-topk:]
         cls_ids = top_indices[np.argsort(probs[top_indices])]
         probs = probs[cls_ids].tolist()
 
-        cls_names = [self.model.id2label[cid] for cid in cls_ids]
-
+        cls_names = [self.id2label[cid] for cid in cls_ids]
         return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names}
diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
deleted file mode 100644
index 3d8e8fea..00000000
--- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from typing import Any, Dict, Union
-
-import numpy as np
-import torch
-
-from modelscope.models.base import Model
-from modelscope.outputs import OutputKeys
-from ...preprocessors import Preprocessor
-from ..base import Pipeline
-
-
-class SequenceClassificationPipelineBase(Pipeline):
-
-    def __init__(self, model: Union[Model, str], preprocessor: Preprocessor,
-                 **kwargs):
-        """This is the base class for all the sequence classification sub-tasks.
-
-        Args:
-            model (str or Model): A model instance or a model local dir or a model id in the model hub.
-            preprocessor (Preprocessor): a preprocessor instance, must not be None.
-        """
-        assert isinstance(model, str) or isinstance(model, Model), \
-            'model must be a single str or Model'
-        model = model if isinstance(model,
-                                    Model) else Model.from_pretrained(model)
-        assert preprocessor is not None
-        model.eval()
-        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
-        self.id2label = kwargs.get('id2label')
-        if self.id2label is None and hasattr(self.preprocessor, 'id2label'):
-            self.id2label = self.preprocessor.id2label
-        assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \
-                                          'as a parameter or make sure the preprocessor has the attribute.'
-
-    def forward(self, inputs: Dict[str, Any],
-                **forward_params) -> Dict[str, Any]:
-        with torch.no_grad():
-            return self.model(**inputs, **forward_params)
-
-    def postprocess(self,
-                    inputs: Dict[str, Any],
-                    topk: int = 5) -> Dict[str, str]:
-        """process the prediction results
-
-        Args:
-            inputs (Dict[str, Any]): _description_
-            topk (int): The topk probs to take
-        Returns:
-            Dict[str, str]: the prediction results
-        """
-
-        probs = inputs[OutputKeys.PROBABILITIES][0]
-        num_classes = probs.shape[0]
-        topk = min(topk, num_classes)
-        top_indices = np.argpartition(probs, -topk)[-topk:]
-        cls_ids = top_indices[np.argsort(probs[top_indices])]
-        probs = probs[cls_ids].tolist()
-
-        cls_names = [self.id2label[cid] for cid in cls_ids]
-        return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names}
diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
deleted file mode 100644
index 0a2f6d25..00000000
--- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from typing import Union
-
-from ...metainfo import Pipelines
-from ...models import Model
-from ...preprocessors import (Preprocessor,
-                              SingleSentenceClassificationPreprocessor)
-from ...utils.constant import Tasks
-from ..builder import PIPELINES
-from .sequence_classification_pipeline_base import \
-    SequenceClassificationPipelineBase
-
-__all__ = ['SingleSentenceClassificationPipeline']
-
-
-@PIPELINES.register_module(
-    Tasks.sentiment_classification,
-    module_name=Pipelines.sentiment_classification)
-class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase):
-
-    def __init__(self,
-                 model: Union[Model, str],
-                 preprocessor: Preprocessor = None,
-                 first_sequence='first_sequence',
-                 **kwargs):
-        """Use `model` and `preprocessor` to create a nlp single sequence classification pipeline for prediction.
-
-        Args:
-            model (str or Model): Supply either a local model dir which supported the sequence classification task,
-            or a model id from the model hub, or a torch model instance.
-            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
-            the model if supplied.
-            first_sequence: The key to read the first sentence in.
-            sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value.
-
-            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
-            param will have no effect.
-
-            Example:
-            >>> from modelscope.pipelines import pipeline
-            >>> pipeline_ins = pipeline(task='sentiment-classification',
-            >>>    model='damo/nlp_structbert_sentiment-classification_chinese-base')
-            >>> sentence1 = '启动的时候很大声音，然后就会听到1.2秒的卡察的声音，类似齿轮摩擦的声音'
-            >>> print(pipeline_ins(sentence1))
-            >>> # Or use the dict input:
-            >>> print(pipeline_ins({'first_sequence': sentence1}))
-
-            To view other examples plese check the tests/pipelines/test_sentiment-classification.py.
-        """
-        if preprocessor is None:
-            preprocessor = SingleSentenceClassificationPreprocessor(
-                model.model_dir if isinstance(model, Model) else model,
-                first_sequence=first_sequence,
-                sequence_length=kwargs.pop('sequence_length', 512))
-        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py
index aabf48d8..5367c1a8 100644
--- a/modelscope/pipelines/nlp/token_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/token_classification_pipeline.py
@@ -49,7 +49,7 @@ class TokenClassificationPipeline(Pipeline):
         text = inputs.pop(OutputKeys.TEXT)
         with torch.no_grad():
             return {
-                **self.model(inputs, **forward_params), OutputKeys.TEXT: text
+                **self.model(**inputs, **forward_params), OutputKeys.TEXT: text
             }
 
     def postprocess(self, inputs: Dict[str, Any],
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index b4be1845..90303b65 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -16,17 +16,23 @@ if TYPE_CHECKING:
     from .kws import WavToLists
     from .multi_modal import (OfaPreprocessor, MPlugPreprocessor)
     from .nlp import (
-        Tokenize, SequenceClassificationPreprocessor,
-        TextGenerationPreprocessor, TokenClassificationPreprocessor,
-        SingleSentenceClassificationPreprocessor,
-        PairSentenceClassificationPreprocessor, FillMaskPreprocessor,
-        ZeroShotClassificationPreprocessor, NERPreprocessor,
-        TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
-        SequenceLabelingPreprocessor, RelationExtractionPreprocessor,
-        DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor,
-        PassageRankingPreprocessor, SentenceEmbeddingPreprocessor,
+        DocumentSegmentationPreprocessor,
+        FaqQuestionAnsweringPreprocessor,
+        FillMaskPoNetPreprocessor,
+        NLPPreprocessor,
+        NLPTokenizerPreprocessorBase,
+        PassageRankingPreprocessor,
+        RelationExtractionPreprocessor,
+        SentenceEmbeddingPreprocessor,
+        SequenceClassificationPreprocessor,
+        TokenClassificationPreprocessor,
+        TextErrorCorrectionPreprocessor,
+        TextGenerationPreprocessor,
         Text2TextGenerationPreprocessor,
-        WordSegmentationBlankSetToLabelPreprocessor)
+        Tokenize,
+        WordSegmentationBlankSetToLabelPreprocessor,
+        ZeroShotClassificationPreprocessor,
+    )
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
                         DialogStateTrackingPreprocessor)
@@ -49,18 +55,22 @@ else:
         'kws': ['WavToLists'],
         'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'],
         'nlp': [
-            'Tokenize', 'SequenceClassificationPreprocessor',
-            'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
-            'SingleSentenceClassificationPreprocessor',
-            'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
-            'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
-            'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
-            'TextErrorCorrectionPreprocessor',
-            'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
+            'DocumentSegmentationPreprocessor',
+            'FaqQuestionAnsweringPreprocessor',
+            'FillMaskPoNetPreprocessor',
+            'NLPPreprocessor',
+            'NLPTokenizerPreprocessorBase',
+            'PassageRankingPreprocessor',
             'RelationExtractionPreprocessor',
+            'SentenceEmbeddingPreprocessor',
+            'SequenceClassificationPreprocessor',
+            'TokenClassificationPreprocessor',
+            'TextErrorCorrectionPreprocessor',
+            'TextGenerationPreprocessor',
+            'Tokenize',
             'Text2TextGenerationPreprocessor',
             'WordSegmentationBlankSetToLabelPreprocessor',
-            'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
+            'ZeroShotClassificationPreprocessor',
         ],
         'space': [
             'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor',
diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py
index 8e75ae98..dfbb5c81 100644
--- a/modelscope/preprocessors/nlp/__init__.py
+++ b/modelscope/preprocessors/nlp/__init__.py
@@ -6,32 +6,41 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .text_error_correction import TextErrorCorrectionPreprocessor
     from .nlp_base import (
-        Tokenize, SequenceClassificationPreprocessor,
-        TextGenerationPreprocessor, TokenClassificationPreprocessor,
-        SingleSentenceClassificationPreprocessor,
-        Text2TextGenerationPreprocessor,
-        PairSentenceClassificationPreprocessor, FillMaskPreprocessor,
-        ZeroShotClassificationPreprocessor, NERPreprocessor,
-        FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor,
-        RelationExtractionPreprocessor, DocumentSegmentationPreprocessor,
-        FillMaskPoNetPreprocessor, PassageRankingPreprocessor,
+        DocumentSegmentationPreprocessor,
+        FaqQuestionAnsweringPreprocessor,
+        FillMaskPoNetPreprocessor,
+        NLPPreprocessor,
+        NLPTokenizerPreprocessorBase,
+        PassageRankingPreprocessor,
+        RelationExtractionPreprocessor,
         SentenceEmbeddingPreprocessor,
-        WordSegmentationBlankSetToLabelPreprocessor)
+        SequenceClassificationPreprocessor,
+        TokenClassificationPreprocessor,
+        TextGenerationPreprocessor,
+        Text2TextGenerationPreprocessor,
+        Tokenize,
+        WordSegmentationBlankSetToLabelPreprocessor,
+        ZeroShotClassificationPreprocessor,
+    )
 
 else:
     _import_structure = {
         'nlp_base': [
-            'Tokenize', 'SequenceClassificationPreprocessor',
-            'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
-            'SingleSentenceClassificationPreprocessor',
-            'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
-            'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
-            'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
-            'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
+            'DocumentSegmentationPreprocessor',
+            'FaqQuestionAnsweringPreprocessor',
+            'FillMaskPoNetPreprocessor',
+            'NLPPreprocessor',
+            'NLPTokenizerPreprocessorBase',
+            'PassageRankingPreprocessor',
             'RelationExtractionPreprocessor',
+            'SentenceEmbeddingPreprocessor',
+            'SequenceClassificationPreprocessor',
+            'TokenClassificationPreprocessor',
+            'TextGenerationPreprocessor',
+            'Tokenize',
             'Text2TextGenerationPreprocessor',
             'WordSegmentationBlankSetToLabelPreprocessor',
-            'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
+            'ZeroShotClassificationPreprocessor',
         ],
         'text_error_correction': [
             'TextErrorCorrectionPreprocessor',
diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py
index d6325eed..6b559de9 100644
--- a/modelscope/preprocessors/nlp/nlp_base.py
+++ b/modelscope/preprocessors/nlp/nlp_base.py
@@ -2,14 +2,13 @@
 
 import os.path as osp
 import re
-import uuid
 from typing import Any, Dict, Iterable, Optional, Tuple, Union
 
 import numpy as np
-from transformers import AutoTokenizer, BertTokenizerFast
+import torch
+from transformers import AutoTokenizer
 
 from modelscope.metainfo import Models, Preprocessors
-from modelscope.models.nlp.structbert import SbertTokenizerFast
 from modelscope.outputs import OutputKeys
 from modelscope.preprocessors.base import Preprocessor
 from modelscope.preprocessors.builder import PREPROCESSORS
@@ -23,24 +22,21 @@ from modelscope.utils.type_assert import type_assert
 logger = get_logger()
 
 __all__ = [
-    'Tokenize',
-    'SequenceClassificationPreprocessor',
-    'TextGenerationPreprocessor',
-    'TokenClassificationPreprocessor',
-    'PairSentenceClassificationPreprocessor',
-    'Text2TextGenerationPreprocessor',
-    'SingleSentenceClassificationPreprocessor',
-    'FillMaskPreprocessor',
-    'ZeroShotClassificationPreprocessor',
-    'NERPreprocessor',
-    'SentenceEmbeddingPreprocessor',
-    'PassageRankingPreprocessor',
-    'FaqQuestionAnsweringPreprocessor',
-    'SequenceLabelingPreprocessor',
-    'RelationExtractionPreprocessor',
     'DocumentSegmentationPreprocessor',
+    'FaqQuestionAnsweringPreprocessor',
+    'NLPPreprocessor',
     'FillMaskPoNetPreprocessor',
+    'NLPTokenizerPreprocessorBase',
+    'PassageRankingPreprocessor',
+    'RelationExtractionPreprocessor',
+    'SentenceEmbeddingPreprocessor',
+    'SequenceClassificationPreprocessor',
+    'TokenClassificationPreprocessor',
+    'Text2TextGenerationPreprocessor',
+    'TextGenerationPreprocessor',
+    'Tokenize',
     'WordSegmentationBlankSetToLabelPreprocessor',
+    'ZeroShotClassificationPreprocessor',
 ]
 
 
@@ -48,85 +44,19 @@ __all__ = [
 class Tokenize(Preprocessor):
 
     def __init__(self, tokenizer_name) -> None:
-        self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
 
     def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
         if isinstance(data, str):
             data = {InputFields.text: data}
-        token_dict = self._tokenizer(data[InputFields.text])
+        token_dict = self.tokenizer(data[InputFields.text])
         data.update(token_dict)
         return data
 
 
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer)
-class SequenceClassificationPreprocessor(Preprocessor):
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """preprocess the data
-
-        Args:
-            model_dir (str): model path
-        """
-
-        super().__init__(*args, **kwargs)
-
-        from easynlp.modelzoo import AutoTokenizer
-        self.model_dir: str = model_dir
-        self.first_sequence: str = kwargs.pop('first_sequence',
-                                              'first_sequence')
-        self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
-        self.sequence_length = kwargs.pop('sequence_length', 128)
-
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
-        print(f'this is the tokenzier {self.tokenizer}')
-        self.label2id = parse_label_mapping(self.model_dir)
-
-    @type_assert(object, (str, tuple, Dict))
-    def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
-        feature = super().__call__(data)
-        if isinstance(data, str):
-            new_data = {self.first_sequence: data}
-        elif isinstance(data, tuple):
-            sentence1, sentence2 = data
-            new_data = {
-                self.first_sequence: sentence1,
-                self.second_sequence: sentence2
-            }
-        else:
-            new_data = data
-
-        # preprocess the data for the model input
-
-        rst = {
-            'id': [],
-            'input_ids': [],
-            'attention_mask': [],
-            'token_type_ids': [],
-        }
-
-        max_seq_length = self.sequence_length
-
-        text_a = new_data[self.first_sequence]
-        text_b = new_data.get(self.second_sequence, None)
-
-        feature = self.tokenizer(
-            text_a,
-            text_b,
-            padding='max_length',
-            truncation=True,
-            max_length=max_seq_length)
-
-        rst['id'].append(new_data.get('id', str(uuid.uuid4())))
-        rst['input_ids'].append(feature['input_ids'])
-        rst['attention_mask'].append(feature['attention_mask'])
-        rst['token_type_ids'].append(feature['token_type_ids'])
-        return rst
-
-
 class NLPTokenizerPreprocessorBase(Preprocessor):
 
-    def __init__(self, model_dir: str, pair: bool, mode: str, **kwargs):
+    def __init__(self, model_dir: str, mode: str, **kwargs):
         """The NLP tokenizer preprocessor base class.
 
         Any nlp preprocessor which uses the hf tokenizer can inherit from this class.
@@ -138,7 +68,6 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
             label: The label key
             label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping
                 if this mapping is not supplied.
-            pair (bool): Pair sentence input or single sentence input.
             mode: Run this preprocessor in either 'train'/'eval'/'inference' mode
             kwargs: These kwargs will be directly fed into the tokenizer.
         """
@@ -148,7 +77,8 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
         self.first_sequence: str = kwargs.pop('first_sequence',
                                               'first_sequence')
         self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
-        self.pair = pair
+        self.sequence_length = kwargs.pop('sequence_length', 128)
+
         self._mode = mode
         self.label = kwargs.pop('label', OutputKeys.LABEL)
         self.label2id = None
@@ -158,6 +88,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
             self.label2id = parse_label_mapping(self.model_dir)
 
         self.tokenize_kwargs = kwargs
+
         self.tokenizer = self.build_tokenizer(model_dir)
 
     @property
@@ -179,20 +110,38 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
         @param model_dir:  The local model dir.
         @return: The initialized tokenizer.
         """
-
+        self.is_transformer_based_model = 'lstm' not in model_dir
+        # fast tokenizers break parallel inference, so inference uses slow ones
         model_type = get_model_type(model_dir)
         if model_type in (Models.structbert, Models.gpt3, Models.palm,
                           Models.plug):
-            from modelscope.models.nlp.structbert import SbertTokenizer
-            return SbertTokenizer.from_pretrained(model_dir, use_fast=False)
+            from modelscope.models.nlp.structbert import SbertTokenizer, SbertTokenizerFast
+            return SbertTokenizer.from_pretrained(
+                model_dir
+            ) if self._mode == ModeKeys.INFERENCE else SbertTokenizerFast.from_pretrained(
+                model_dir)
         elif model_type == Models.veco:
-            from modelscope.models.nlp.veco import VecoTokenizer
-            return VecoTokenizer.from_pretrained(model_dir)
+            from modelscope.models.nlp.veco import VecoTokenizer, VecoTokenizerFast
+            return VecoTokenizer.from_pretrained(
+                model_dir
+            ) if self._mode == ModeKeys.INFERENCE else VecoTokenizerFast.from_pretrained(
+                model_dir)
         elif model_type == Models.deberta_v2:
-            from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer
-            return DebertaV2Tokenizer.from_pretrained(model_dir)
+            from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast
+            return DebertaV2Tokenizer.from_pretrained(
+                model_dir
+            ) if self._mode == ModeKeys.INFERENCE else DebertaV2TokenizerFast.from_pretrained(
+                model_dir)
+        elif not self.is_transformer_based_model:
+            from transformers import BertTokenizer, BertTokenizerFast
+            return BertTokenizer.from_pretrained(
+                model_dir
+            ) if self._mode == ModeKeys.INFERENCE else BertTokenizerFast.from_pretrained(
+                model_dir)
         else:
-            return AutoTokenizer.from_pretrained(model_dir, use_fast=False)
+            return AutoTokenizer.from_pretrained(
+                model_dir,
+                use_fast=False if self._mode == ModeKeys.INFERENCE else True)
 
     def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]:
         """process the raw input data
@@ -239,7 +188,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
             if len(data) == 3:
                 text_a, text_b, labels = data
             elif len(data) == 2:
-                if self.pair:
+                if self._mode == ModeKeys.INFERENCE:
                     text_a, text_b = data
                 else:
                     text_a, labels = data
@@ -277,6 +226,22 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
                 output[OutputKeys.LABELS] = labels
 
 
+@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask)
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.feature_extraction)
+class NLPPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor for fill-mask (MLM) and feature extraction.
+    """
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        kwargs['truncation'] = kwargs.get('truncation', True)
+        kwargs['padding'] = kwargs.get('padding', 'max_length')
+        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
+        kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
+                                                     True)
+        super().__init__(model_dir, mode=mode, **kwargs)
+
+
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.passage_ranking)
 class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase):
@@ -337,22 +302,12 @@ class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase):
     Fields.nlp, module_name=Preprocessors.nli_tokenizer)
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer)
-class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in pair sentence classification.
-    """
-
-    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
-        kwargs['truncation'] = kwargs.get('truncation', True)
-        kwargs['padding'] = kwargs.get(
-            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
-        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
-        super().__init__(model_dir, pair=True, mode=mode, **kwargs)
-
-
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer)
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
-class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in single sentence classification.
+class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in sequence classification.
     """
 
     def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
@@ -360,7 +315,7 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
         kwargs['padding'] = kwargs.get(
             'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
         kwargs['max_length'] = kwargs.pop('sequence_length', 128)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
+        super().__init__(model_dir, mode=mode, **kwargs)
 
 
 @PREPROCESSORS.register_module(
@@ -421,7 +376,7 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase):
             model_dir (str): model path
         """
         self.sequence_length = kwargs.pop('sequence_length', 512)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
+        super().__init__(model_dir, mode=mode, **kwargs)
 
     def __call__(self, data: Union[str, Dict], hypothesis_template: str,
                  candidate_labels: list) -> Dict[str, Any]:
@@ -496,14 +451,12 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
                  tokenizer=None,
                  mode=ModeKeys.INFERENCE,
                  **kwargs):
-        self.tokenizer = self.build_tokenizer(
-            model_dir) if tokenizer is None else tokenizer
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      False)
         kwargs['max_length'] = kwargs.pop('sequence_length', 128)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
+        super().__init__(model_dir, mode=mode, **kwargs)
 
     @staticmethod
     def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]:
@@ -541,20 +494,6 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
         }
 
 
-@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask)
-class FillMaskPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in MLM task.
-    """
-
-    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
-        kwargs['truncation'] = kwargs.get('truncation', True)
-        kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
-        kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
-                                                     True)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
-
-
 @PREPROCESSORS.register_module(
     Fields.nlp,
     module_name=Preprocessors.word_segment_text_to_label_preprocessor)
@@ -592,21 +531,40 @@ class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor):
         }
 
 
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.ner_tokenizer)
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer)
 class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in normal token classification task.
+    """The tokenizer preprocessor for token classification tasks such as NER.
     """
 
     def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        """preprocess the data
+
+        Args:
+            model_dir (str): model path
+        """
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get(
             'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
         kwargs['max_length'] = kwargs.pop('sequence_length', 128)
         self.label_all_tokens = kwargs.pop('label_all_tokens', False)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
+        super().__init__(model_dir, mode=mode, **kwargs)
 
-    def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]:
+        if 'is_split_into_words' in kwargs:
+            self.is_split_into_words = kwargs.pop('is_split_into_words')
+        else:
+            self.is_split_into_words = self.tokenizer.init_kwargs.get(
+                'is_split_into_words', False)
+        if 'label2id' in kwargs:
+            kwargs.pop('label2id')
+        self.tokenize_kwargs = kwargs
+
+    @type_assert(object, str)
+    def __call__(self, data: str) -> Dict[str, Any]:
         """process the raw input data
 
         Args:
@@ -618,23 +576,84 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
             Dict[str, Any]: the preprocessed data
         """
 
-        text_a = None
+        # preprocess the data for the model input
+        text = None
         labels_list = None
         if isinstance(data, str):
-            text_a = data
+            text = data
         elif isinstance(data, dict):
-            text_a = data.get(self.first_sequence)
+            text = data.get(self.first_sequence)
             labels_list = data.get(self.label)
 
-        if isinstance(text_a, str):
-            text_a = text_a.replace(' ', '').strip()
+        input_ids = []
+        label_mask = []
+        offset_mapping = []
+        if self.is_split_into_words:
+            for offset, token in enumerate(list(data)):
+                subtoken_ids = self.tokenizer.encode(
+                    token, add_special_tokens=False)
+                if len(subtoken_ids) == 0:
+                    subtoken_ids = [self.tokenizer.unk_token_id]
+                input_ids.extend(subtoken_ids)
+                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
+                offset_mapping.extend([(offset, offset + 1)])
+        else:
+            if self.tokenizer.is_fast:
+                encodings = self.tokenizer(
+                    text,
+                    add_special_tokens=False,
+                    return_offsets_mapping=True,
+                    **self.tokenize_kwargs)
+                input_ids = encodings['input_ids']
+                word_ids = encodings.word_ids()
+                for i in range(len(word_ids)):
+                    if word_ids[i] is None:
+                        label_mask.append(0)
+                    elif word_ids[i] == word_ids[i - 1]:
+                        label_mask.append(0)
+                        offset_mapping[-1] = (
+                            offset_mapping[-1][0],
+                            encodings['offset_mapping'][i][1])
+                    else:
+                        label_mask.append(1)
+                        offset_mapping.append(encodings['offset_mapping'][i])
+            else:
+                encodings = self.tokenizer(
+                    text, add_special_tokens=False, **self.tokenize_kwargs)
+                input_ids = encodings['input_ids']
+                label_mask, offset_mapping = self.get_label_mask_and_offset_mapping(
+                    text)
 
-        tokenized_inputs = self.tokenizer(
-            [t for t in text_a],
-            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
-            is_split_into_words=True,
-            **self.tokenize_kwargs)
+        if len(input_ids) >= self.sequence_length - 2:
+            input_ids = input_ids[:self.sequence_length - 2]
+            label_mask = label_mask[:self.sequence_length - 2]
+        input_ids = [self.tokenizer.cls_token_id
+                     ] + input_ids + [self.tokenizer.sep_token_id]
+        label_mask = [0] + label_mask + [0]
+        attention_mask = [1] * len(input_ids)
+        offset_mapping = offset_mapping[:sum(label_mask)]
 
+        if not self.is_transformer_based_model:
+            input_ids = input_ids[1:-1]
+            attention_mask = attention_mask[1:-1]
+            label_mask = label_mask[1:-1]
+
+        if self._mode == ModeKeys.INFERENCE:
+            input_ids = torch.tensor(input_ids).unsqueeze(0)
+            attention_mask = torch.tensor(attention_mask).unsqueeze(0)
+            label_mask = torch.tensor(
+                label_mask, dtype=torch.bool).unsqueeze(0)
+
+        # the token classification
+        output = {
+            'text': text,
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'label_mask': label_mask,
+            'offset_mapping': offset_mapping
+        }
+
+        # align the labels with tokenized text
         if labels_list is not None:
             assert self.label2id is not None
             # Map that sends B-Xxx label to its I-Xxx counterpart
@@ -653,7 +672,6 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
                     b_to_i_label.append(idx)
 
             label_row = [self.label2id[lb] for lb in labels_list]
-            word_ids = tokenized_inputs.word_ids()
             previous_word_idx = None
             label_ids = []
             for word_idx in word_ids:
@@ -668,229 +686,66 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
                         label_ids.append(-100)
                 previous_word_idx = word_idx
             labels = label_ids
-            tokenized_inputs['labels'] = labels
-            # new code end
+            output['labels'] = labels
+        return output
 
-        if self._mode == ModeKeys.INFERENCE:
-            tokenized_inputs[OutputKeys.TEXT] = text_a
-        return tokenized_inputs
+    def get_tokenizer_class(self):
+        tokenizer_class = self.tokenizer.__class__.__name__
+        if tokenizer_class.endswith(
+                'Fast') and tokenizer_class != 'PreTrainedTokenizerFast':
+            tokenizer_class = tokenizer_class[:-4]
+        return tokenizer_class
 
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.ner_tokenizer)
-class NERPreprocessor(Preprocessor):
-    """The tokenizer preprocessor used in normal NER task.
-
-    NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition.
-    """
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """preprocess the data
-
-        Args:
-            model_dir (str): model path
-        """
-
-        super().__init__(*args, **kwargs)
-
-        self.model_dir: str = model_dir
-        self.sequence_length = kwargs.pop('sequence_length', 512)
-        self.is_transformer_based_model = 'lstm' not in model_dir
-        if self.is_transformer_based_model:
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_dir, use_fast=True)
-        else:
-            self.tokenizer = BertTokenizerFast.from_pretrained(
-                model_dir, use_fast=True)
-        self.is_split_into_words = self.tokenizer.init_kwargs.get(
-            'is_split_into_words', False)
-
-    @type_assert(object, str)
-    def __call__(self, data: str) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (str): a sentence
-                Example:
-                    'you are so handsome.'
-
-        Returns:
-            Dict[str, Any]: the preprocessed data
-        """
-
-        # preprocess the data for the model input
-        text = data
-        if self.is_split_into_words:
-            input_ids = []
-            label_mask = []
-            offset_mapping = []
-            for offset, token in enumerate(list(data)):
-                subtoken_ids = self.tokenizer.encode(
-                    token, add_special_tokens=False)
-                if len(subtoken_ids) == 0:
-                    subtoken_ids = [self.tokenizer.unk_token_id]
-                input_ids.extend(subtoken_ids)
-                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
-                offset_mapping.extend([(offset, offset + 1)]
-                                      + [(offset + 1, offset + 1)]
-                                      * (len(subtoken_ids) - 1))
-            if len(input_ids) >= self.sequence_length - 2:
-                input_ids = input_ids[:self.sequence_length - 2]
-                label_mask = label_mask[:self.sequence_length - 2]
-                offset_mapping = offset_mapping[:self.sequence_length - 2]
-            input_ids = [self.tokenizer.cls_token_id
-                         ] + input_ids + [self.tokenizer.sep_token_id]
-            label_mask = [0] + label_mask + [0]
-            attention_mask = [1] * len(input_ids)
-        else:
-            encodings = self.tokenizer(
-                text,
-                add_special_tokens=True,
-                padding=True,
-                truncation=True,
-                max_length=self.sequence_length,
-                return_offsets_mapping=True)
-            input_ids = encodings['input_ids']
-            attention_mask = encodings['attention_mask']
-            word_ids = encodings.word_ids()
-            label_mask = []
-            offset_mapping = []
-            for i in range(len(word_ids)):
-                if word_ids[i] is None:
-                    label_mask.append(0)
-                elif word_ids[i] == word_ids[i - 1]:
-                    label_mask.append(0)
-                    offset_mapping[-1] = (offset_mapping[-1][0],
-                                          encodings['offset_mapping'][i][1])
+    def get_label_mask_and_offset_mapping(self, text):
+        label_mask = []
+        offset_mapping = []
+        tokens = self.tokenizer.tokenize(text)
+        offset = 0
+        if self.get_tokenizer_class() == 'BertTokenizer':
+            for token in tokens:
+                is_start = (token[:2] != '##')
+                if is_start:
+                    label_mask.append(True)
                 else:
-                    label_mask.append(1)
-                    offset_mapping.append(encodings['offset_mapping'][i])
-
-        if not self.is_transformer_based_model:
-            input_ids = input_ids[1:-1]
-            attention_mask = attention_mask[1:-1]
-            label_mask = label_mask[1:-1]
-        return {
-            'text': text,
-            'input_ids': input_ids,
-            'attention_mask': attention_mask,
-            'label_mask': label_mask,
-            'offset_mapping': offset_mapping
-        }
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer)
-class SequenceLabelingPreprocessor(Preprocessor):
-    """The tokenizer preprocessor used in normal NER task.
-
-    NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition.
-    """
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """preprocess the data via the vocab.txt from the `model_dir` path
-
-        Args:
-            model_dir (str): model path
-        """
-
-        super().__init__(*args, **kwargs)
-
-        self.model_dir: str = model_dir
-        self.sequence_length = kwargs.pop('sequence_length', 512)
-
-        if 'lstm' in model_dir or 'gcnn' in model_dir:
-            self.tokenizer = BertTokenizerFast.from_pretrained(
-                model_dir, use_fast=False)
-        elif 'structbert' in model_dir:
-            self.tokenizer = SbertTokenizerFast.from_pretrained(
-                model_dir, use_fast=False)
-        else:
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_dir, use_fast=False)
-        self.is_split_into_words = self.tokenizer.init_kwargs.get(
-            'is_split_into_words', False)
-
-    @type_assert(object, str)
-    def __call__(self, data: str) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (str): a sentence
-                Example:
-                    'you are so handsome.'
-
-        Returns:
-            Dict[str, Any]: the preprocessed data
-        """
-
-        # preprocess the data for the model input
-        text = data
-        if self.is_split_into_words:
-            input_ids = []
-            label_mask = []
-            offset_mapping = []
-            for offset, token in enumerate(list(data)):
-                subtoken_ids = self.tokenizer.encode(
-                    token, add_special_tokens=False)
-                if len(subtoken_ids) == 0:
-                    subtoken_ids = [self.tokenizer.unk_token_id]
-                input_ids.extend(subtoken_ids)
-                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
-                offset_mapping.extend([(offset, offset + 1)]
-                                      + [(offset + 1, offset + 1)]
-                                      * (len(subtoken_ids) - 1))
-            if len(input_ids) >= self.sequence_length - 2:
-                input_ids = input_ids[:self.sequence_length - 2]
-                label_mask = label_mask[:self.sequence_length - 2]
-                offset_mapping = offset_mapping[:self.sequence_length - 2]
-            input_ids = [self.tokenizer.cls_token_id
-                         ] + input_ids + [self.tokenizer.sep_token_id]
-            label_mask = [0] + label_mask + [0]
-            attention_mask = [1] * len(input_ids)
-        else:
-            encodings = self.tokenizer(
-                text,
-                add_special_tokens=True,
-                padding=True,
-                truncation=True,
-                max_length=self.sequence_length,
-                return_offsets_mapping=True)
-            input_ids = encodings['input_ids']
-            attention_mask = encodings['attention_mask']
-            word_ids = encodings.word_ids()
-            label_mask = []
-            offset_mapping = []
-            for i in range(len(word_ids)):
-                if word_ids[i] is None:
-                    label_mask.append(0)
-                elif word_ids[i] == word_ids[i - 1]:
-                    label_mask.append(0)
-                    offset_mapping[-1] = (offset_mapping[-1][0],
-                                          encodings['offset_mapping'][i][1])
+                    token = token[2:]
+                    label_mask.append(False)
+                start = offset + text[offset:].index(token)
+                end = start + len(token)
+                if is_start:
+                    offset_mapping.append((start, end))
                 else:
-                    label_mask.append(1)
-                    offset_mapping.append(encodings['offset_mapping'][i])
+                    offset_mapping[-1] = (offset_mapping[-1][0], end)
+                offset = end
+        elif self.get_tokenizer_class() == 'XLMRobertaTokenizer':
+            last_is_blank = False
+            for token in tokens:
+                is_start = (token[0] == '▁')
+                if is_start:
+                    token = token[1:]
+                    label_mask.append(True)
+                    if len(token) == 0:
+                        last_is_blank = True
+                        continue
+                else:
+                    label_mask.append(False)
+                start = offset + text[offset:].index(token)
+                end = start + len(token)
+                if last_is_blank or is_start:
+                    offset_mapping.append((start, end))
+                else:
+                    offset_mapping[-1] = (offset_mapping[-1][0], end)
+                offset = end
+                last_is_blank = False
+        else:
+            raise NotImplementedError
 
-        if not self.is_transformer_based_model:
-            input_ids = input_ids[1:-1]
-            attention_mask = attention_mask[1:-1]
-            label_mask = label_mask[1:-1]
-        return {
-            'text': text,
-            'input_ids': input_ids,
-            'attention_mask': attention_mask,
-            'label_mask': label_mask,
-            'offset_mapping': offset_mapping
-        }
+        return label_mask, offset_mapping
 
 
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.re_tokenizer)
 class RelationExtractionPreprocessor(Preprocessor):
-    """The tokenizer preprocessor used in normal RE task.
-
-    NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition.
+    """The relation extraction preprocessor used in normal RE task.
     """
 
     def __init__(self, model_dir: str, *args, **kwargs):
@@ -937,7 +792,7 @@ class FaqQuestionAnsweringPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         super(FaqQuestionAnsweringPreprocessor, self).__init__(
-            model_dir, pair=False, mode=ModeKeys.INFERENCE, **kwargs)
+            model_dir, mode=ModeKeys.INFERENCE, **kwargs)
         import os
         from transformers import BertTokenizer
 
@@ -1026,7 +881,7 @@ class DocumentSegmentationPreprocessor(Preprocessor):
         """
 
         super().__init__(*args, **kwargs)
-
+        from transformers import BertTokenizerFast
         self.tokenizer = BertTokenizerFast.from_pretrained(
             model_dir,
             use_fast=True,
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 75add1d9..b19c0fce 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -115,6 +115,7 @@ class NLPTasks(object):
     conversational_text_to_sql = 'conversational-text-to-sql'
     information_extraction = 'information-extraction'
     document_segmentation = 'document-segmentation'
+    feature_extraction = 'feature-extraction'
 
 
 class AudioTasks(object):
diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py
index 3cf88114..7a9c79e2 100644
--- a/modelscope/utils/registry.py
+++ b/modelscope/utils/registry.py
@@ -74,7 +74,6 @@ class Registry(object):
             raise KeyError(f'{module_name} is already registered in '
                            f'{self._name}[{group_key}]')
         self._modules[group_key][module_name] = module_cls
-        module_cls.group_key = group_key
 
     def register_module(self,
                         group_key: str = default_group,
@@ -196,6 +195,7 @@ def build_from_cfg(cfg,
         if obj_cls is None:
             raise KeyError(f'{obj_type} is not in the {registry.name}'
                            f' registry group {group_key}')
+        obj_cls.group_key = group_key
     elif inspect.isclass(obj_type) or inspect.isfunction(obj_type):
         obj_cls = obj_type
     else:
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index 762530f4..91a3b5c5 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -75,7 +75,8 @@ class MsDatasetTest(unittest.TestCase):
         preprocessor = SequenceClassificationPreprocessor(
             nlp_model.model_dir,
             first_sequence='premise',
-            second_sequence=None)
+            second_sequence=None,
+            padding='max_length')
         ms_ds_train = MsDataset.load(
             'xcopa',
             subset_name='translation-et',
diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py
index 4f3206cd..549d2cb3 100644
--- a/tests/pipelines/test_deberta_tasks.py
+++ b/tests/pipelines/test_deberta_tasks.py
@@ -6,11 +6,9 @@ import torch
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import DebertaV2ForMaskedLM
-from modelscope.models.nlp.deberta_v2 import (DebertaV2Tokenizer,
-                                              DebertaV2TokenizerFast)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import FillMaskPipeline
-from modelscope.preprocessors import FillMaskPreprocessor
+from modelscope.preprocessors import NLPPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level
 
@@ -24,7 +22,7 @@ class DeBERTaV2TaskTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
         model_dir = snapshot_download(self.model_id_deberta)
-        preprocessor = FillMaskPreprocessor(
+        preprocessor = NLPPreprocessor(
             model_dir, first_sequence='sentence', second_sequence=None)
         model = DebertaV2ForMaskedLM.from_pretrained(model_dir)
         pipeline1 = FillMaskPipeline(model, preprocessor)
@@ -40,7 +38,7 @@ class DeBERTaV2TaskTest(unittest.TestCase):
         # sbert
         print(self.model_id_deberta)
         model = Model.from_pretrained(self.model_id_deberta)
-        preprocessor = FillMaskPreprocessor(
+        preprocessor = NLPPreprocessor(
             model.model_dir, first_sequence='sentence', second_sequence=None)
         pipeline_ins = pipeline(
             task=Tasks.fill_mask, model=model, preprocessor=preprocessor)
diff --git a/tests/pipelines/test_feature_extraction.py b/tests/pipelines/test_feature_extraction.py
new file mode 100644
index 00000000..39291e76
--- /dev/null
+++ b/tests/pipelines/test_feature_extraction.py
@@ -0,0 +1,67 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import numpy as np
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import FeatureExtractionModel
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import FeatureExtractionPipeline
+from modelscope.preprocessors import NLPPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class FeatureExtractionTaskModelTest(unittest.TestCase,
+                                     DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.feature_extraction
+        self.model_id = 'damo/pert_feature-extraction_base-test'
+
+    sentence1 = '测试embedding'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_direct_file_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = NLPPreprocessor(cache_path, padding=False)
+        model = FeatureExtractionModel.from_pretrained(self.model_id)
+        pipeline1 = FeatureExtractionPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.feature_extraction, model=model, preprocessor=tokenizer)
+        result = pipeline1(input=self.sentence1)
+
+        print(f'sentence1: {self.sentence1}\n'
+              f'pipeline1:{np.shape(result[OutputKeys.TEXT_EMBEDDING])}')
+        result = pipeline2(input=self.sentence1)
+        print(f'sentence1: {self.sentence1}\n'
+              f'pipeline1: {np.shape(result[OutputKeys.TEXT_EMBEDDING])}')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = NLPPreprocessor(model.model_dir, padding=False)
+        pipeline_ins = pipeline(
+            task=Tasks.feature_extraction, model=model, preprocessor=tokenizer)
+        result = pipeline_ins(input=self.sentence1)
+        print(np.shape(result[OutputKeys.TEXT_EMBEDDING]))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.feature_extraction, model=self.model_id)
+        result = pipeline_ins(input=self.sentence1)
+        print(np.shape(result[OutputKeys.TEXT_EMBEDDING]))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.feature_extraction)
+        result = pipeline_ins(input=self.sentence1)
+        print(np.shape(result[OutputKeys.TEXT_EMBEDDING]))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py
index cec8966f..0e5e242b 100644
--- a/tests/pipelines/test_fill_mask.py
+++ b/tests/pipelines/test_fill_mask.py
@@ -1,13 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import unittest
 
+from regex import R
+
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM,
                                    VecoForMaskedLM)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import FillMaskPipeline
-from modelscope.preprocessors import FillMaskPreprocessor
+from modelscope.preprocessors import NLPPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import MsRegressTool
@@ -51,7 +53,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
         # sbert
         for language in ['zh']:
             model_dir = snapshot_download(self.model_id_sbert[language])
-            preprocessor = FillMaskPreprocessor(
+            preprocessor = NLPPreprocessor(
                 model_dir, first_sequence='sentence', second_sequence=None)
             model = StructBertForMaskedLM.from_pretrained(model_dir)
             pipeline1 = FillMaskPipeline(model, preprocessor)
@@ -66,7 +68,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
 
         # veco
         model_dir = snapshot_download(self.model_id_veco)
-        preprocessor = FillMaskPreprocessor(
+        preprocessor = NLPPreprocessor(
             model_dir, first_sequence='sentence', second_sequence=None)
         model = VecoForMaskedLM.from_pretrained(model_dir)
         pipeline1 = FillMaskPipeline(model, preprocessor)
@@ -80,13 +82,28 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
                 f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n'
             )
 
+        # bert
+        language = 'zh'
+        model_dir = snapshot_download(self.model_id_bert, revision='beta')
+        preprocessor = NLPPreprocessor(
+            model_dir, first_sequence='sentence', second_sequence=None)
+        model = Model.from_pretrained(model_dir)
+        pipeline1 = FillMaskPipeline(model, preprocessor)
+        pipeline2 = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        ori_text = self.ori_texts[language]
+        test_input = self.test_inputs[language]
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: '
+              f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
+
         # sbert
         for language in ['zh']:
             print(self.model_id_sbert[language])
             model = Model.from_pretrained(self.model_id_sbert[language])
-            preprocessor = FillMaskPreprocessor(
+            preprocessor = NLPPreprocessor(
                 model.model_dir,
                 first_sequence='sentence',
                 second_sequence=None)
@@ -100,7 +117,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
 
         # veco
         model = Model.from_pretrained(self.model_id_veco)
-        preprocessor = FillMaskPreprocessor(
+        preprocessor = NLPPreprocessor(
             model.model_dir, first_sequence='sentence', second_sequence=None)
         pipeline_ins = pipeline(
             Tasks.fill_mask, model=model, preprocessor=preprocessor)
@@ -113,6 +130,18 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
                     f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
                     f'{pipeline_ins(test_input)}\n')
 
+        # bert: load the 'beta' revision from the hub and print its prediction
+        language = 'zh'
+        model = Model.from_pretrained(self.model_id_bert, revision='beta')
+        preprocessor = NLPPreprocessor(
+            model.model_dir, first_sequence='sentence', second_sequence=None)
+        pipeline_ins = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        test_input = self.test_inputs[language]
+        print(
+            f'\nori_text: {self.ori_texts[language]}\ninput: {test_input}\npipeline: '
+            f'{pipeline_ins(test_input)}\n')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         # veco
@@ -131,6 +160,16 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
             f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
             f'{pipeline_ins(self.test_inputs[language])}\n')
 
+        # Bert
+        language = 'zh'
+        pipeline_ins = pipeline(
+            task=Tasks.fill_mask,
+            model=self.model_id_bert,
+            model_revision='beta')
+        print(
+            f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
+            f'{pipeline_ins(self.test_inputs[language])}\n')
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.fill_mask)
diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py
index 9fae2d09..3658cf3f 100644
--- a/tests/pipelines/test_named_entity_recognition.py
+++ b/tests/pipelines/test_named_entity_recognition.py
@@ -7,7 +7,7 @@ from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition,
                                    TransformerCRFForNamedEntityRecognition)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
-from modelscope.preprocessors import NERPreprocessor
+from modelscope.preprocessors import TokenClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
@@ -26,7 +26,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_tcrf_by_direct_model_download(self):
         cache_path = snapshot_download(self.tcrf_model_id)
-        tokenizer = NERPreprocessor(cache_path)
+        tokenizer = TokenClassificationPreprocessor(cache_path)
         model = TransformerCRFForNamedEntityRecognition(
             cache_path, tokenizer=tokenizer)
         pipeline1 = NamedEntityRecognitionPipeline(
@@ -43,7 +43,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_lcrf_by_direct_model_download(self):
         cache_path = snapshot_download(self.lcrf_model_id)
-        tokenizer = NERPreprocessor(cache_path)
+        tokenizer = TokenClassificationPreprocessor(cache_path)
         model = LSTMCRFForNamedEntityRecognition(
             cache_path, tokenizer=tokenizer)
         pipeline1 = NamedEntityRecognitionPipeline(
@@ -60,7 +60,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_tcrf_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.tcrf_model_id)
-        tokenizer = NERPreprocessor(model.model_dir)
+        tokenizer = TokenClassificationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.named_entity_recognition,
             model=model,
@@ -70,7 +70,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_lcrf_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.lcrf_model_id)
-        tokenizer = NERPreprocessor(model.model_dir)
+        tokenizer = TokenClassificationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.named_entity_recognition,
             model=model,
diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py
index a53ac3b3..db4b9912 100644
--- a/tests/pipelines/test_nli.py
+++ b/tests/pipelines/test_nli.py
@@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import PairSentenceClassificationPipeline
-from modelscope.preprocessors import PairSentenceClassificationPreprocessor
+from modelscope.pipelines.nlp import SequenceClassificationPipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import MsRegressTool
@@ -26,9 +26,9 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
         cache_path = snapshot_download(self.model_id)
-        tokenizer = PairSentenceClassificationPreprocessor(cache_path)
+        tokenizer = SequenceClassificationPreprocessor(cache_path)
         model = SbertForSequenceClassification.from_pretrained(cache_path)
-        pipeline1 = PairSentenceClassificationPipeline(
+        pipeline1 = SequenceClassificationPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer)
         print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
@@ -40,7 +40,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        tokenizer = PairSentenceClassificationPreprocessor(model.model_dir)
+        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.nli, model=model, preprocessor=tokenizer)
         print(pipeline_ins(input=(self.sentence1, self.sentence2)))
diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py
index 4079455d..288d38c7 100644
--- a/tests/pipelines/test_sentence_similarity.py
+++ b/tests/pipelines/test_sentence_similarity.py
@@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import PairSentenceClassificationPipeline
-from modelscope.preprocessors import PairSentenceClassificationPreprocessor
+from modelscope.pipelines.nlp import SequenceClassificationPipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import MsRegressTool
@@ -26,9 +26,9 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
         cache_path = snapshot_download(self.model_id)
-        tokenizer = PairSentenceClassificationPreprocessor(cache_path)
+        tokenizer = SequenceClassificationPreprocessor(cache_path)
         model = SbertForSequenceClassification.from_pretrained(cache_path)
-        pipeline1 = PairSentenceClassificationPipeline(
+        pipeline1 = SequenceClassificationPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.sentence_similarity, model=model, preprocessor=tokenizer)
@@ -43,7 +43,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        tokenizer = PairSentenceClassificationPreprocessor(model.model_dir)
+        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.sentence_similarity,
             model=model,
diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py
index 3db9971a..d0b1b40f 100644
--- a/tests/pipelines/test_sentiment_classification.py
+++ b/tests/pipelines/test_sentiment_classification.py
@@ -6,8 +6,8 @@ from modelscope.models import Model
 from modelscope.models.nlp.task_models.sequence_classification import \
     SequenceClassificationModel
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline
-from modelscope.preprocessors import SingleSentenceClassificationPreprocessor
+from modelscope.pipelines.nlp import SequenceClassificationPipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
@@ -17,23 +17,21 @@ class SentimentClassificationTaskModelTest(unittest.TestCase,
                                            DemoCompatibilityCheck):
 
     def setUp(self) -> None:
-        self.task = Tasks.sentiment_classification
+        self.task = Tasks.text_classification
         self.model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
 
     sentence1 = '启动的时候很大声音，然后就会听到1.2秒的卡察的声音，类似齿轮摩擦的声音'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
-        cache_path = snapshot_download(self.model_id)
-        tokenizer = SingleSentenceClassificationPreprocessor(cache_path)
+        cache_path = snapshot_download(self.model_id, revision='beta')
+        tokenizer = SequenceClassificationPreprocessor(cache_path)
         model = SequenceClassificationModel.from_pretrained(
-            self.model_id, num_labels=2)
-        pipeline1 = SingleSentenceClassificationPipeline(
+            self.model_id, num_labels=2, revision='beta')
+        pipeline1 = SequenceClassificationPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
-            Tasks.sentiment_classification,
-            model=model,
-            preprocessor=tokenizer)
+            Tasks.text_classification, model=model, preprocessor=tokenizer)
         print(f'sentence1: {self.sentence1}\n'
               f'pipeline1:{pipeline1(input=self.sentence1)}')
         print(f'sentence1: {self.sentence1}\n'
@@ -41,10 +39,10 @@ class SentimentClassificationTaskModelTest(unittest.TestCase,
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
-        tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir)
+        model = Model.from_pretrained(self.model_id, revision='beta')
+        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
-            task=Tasks.sentiment_classification,
+            task=Tasks.text_classification,
             model=model,
             preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence1))
@@ -54,14 +52,17 @@ class SentimentClassificationTaskModelTest(unittest.TestCase,
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
-            task=Tasks.sentiment_classification, model=self.model_id)
+            task=Tasks.text_classification,
+            model=self.model_id,
+            model_revision='beta')
         print(pipeline_ins(input=self.sentence1))
         self.assertTrue(
             isinstance(pipeline_ins.model, SequenceClassificationModel))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_default_model(self):
-        pipeline_ins = pipeline(task=Tasks.sentiment_classification)
+        pipeline_ins = pipeline(
+            task=Tasks.text_classification, model_revision='beta')
         print(pipeline_ins(input=self.sentence1))
         self.assertTrue(
             isinstance(pipeline_ins.model, SequenceClassificationModel))
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index 71b9f3e2..39dbac99 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -12,6 +12,7 @@ from modelscope.utils.test_utils import test_level
 
 
 class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
+    sentence1 = 'i like this wonderful place'
 
     def setUp(self) -> None:
         self.model_id = 'damo/bert-base-sst2'
@@ -46,7 +47,8 @@ class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
             task=Tasks.text_classification,
             model=model,
             preprocessor=preprocessor)
-        self.predict(pipeline_ins)
+        print(f'sentence1: {self.sentence1}\n'
+              f'pipeline1:{pipeline_ins(input=self.sentence1)}')
 
     # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     @unittest.skip('nlp model does not support tensor input, skipped')
diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py
index 4271e201..f9f4d93f 100644
--- a/tests/preprocessors/test_nlp.py
+++ b/tests/preprocessors/test_nlp.py
@@ -32,6 +32,82 @@ class NLPPreprocessorTest(unittest.TestCase):
             output['attention_mask'],
             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 
+    def test_token_classification_tokenize(self):
+        with self.subTest(tokenizer_type='bert'):
+            cfg = dict(
+                type='token-cls-tokenizer',
+                model_dir='bert-base-cased',
+                label2id={
+                    'O': 0,
+                    'B': 1,
+                    'I': 2
+                })
+            preprocessor = build_preprocessor(cfg, Fields.nlp)
+            input = 'Do not meddle in the affairs of wizards, ' \
+                    'for they are subtle and quick to anger.'
+            output = preprocessor(input)
+            self.assertTrue(InputFields.text in output)
+            self.assertEqual(output['input_ids'].tolist()[0], [
+                101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678,
+                1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470,
+                119, 102
+            ])
+            self.assertEqual(output['attention_mask'].tolist()[0], [
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                1
+            ])
+            self.assertEqual(output['label_mask'].tolist()[0], [
+                False, True, True, True, False, True, True, True, True, True,
+                False, True, True, True, True, True, True, True, True, True,
+                True, False
+            ])
+            self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6),
+                                                        (7, 13), (14, 16),
+                                                        (17, 20), (21, 28),
+                                                        (29, 31), (32, 39),
+                                                        (39, 40), (41, 44),
+                                                        (45, 49), (50, 53),
+                                                        (54, 60), (61, 64),
+                                                        (65, 70), (71, 73),
+                                                        (74, 79), (79, 80)])
+
+        with self.subTest(tokenizer_type='roberta'):
+            cfg = dict(
+                type='token-cls-tokenizer',
+                model_dir='xlm-roberta-base',
+                label2id={
+                    'O': 0,
+                    'B': 1,
+                    'I': 2
+                })
+            preprocessor = build_preprocessor(cfg, Fields.nlp)
+            input = 'Do not meddle in the affairs of wizards, ' \
+                    'for they are subtle and quick to anger.'
+            output = preprocessor(input)
+            self.assertTrue(InputFields.text in output)
+            self.assertEqual(output['input_ids'].tolist()[0], [
+                0, 984, 959, 128, 19298, 23, 70, 103086, 7, 111, 6, 44239,
+                99397, 4, 100, 1836, 621, 1614, 17991, 136, 63773, 47, 348, 56,
+                5, 2
+            ])
+            self.assertEqual(output['attention_mask'].tolist()[0], [
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                1, 1, 1, 1, 1
+            ])
+            self.assertEqual(output['label_mask'].tolist()[0], [
+                False, True, True, True, False, True, True, True, False, True,
+                True, False, False, False, True, True, True, True, False, True,
+                True, True, True, False, False, False
+            ])
+            self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6),
+                                                        (7, 13), (14, 16),
+                                                        (17, 20), (21, 28),
+                                                        (29, 31), (32, 40),
+                                                        (41, 44), (45, 49),
+                                                        (50, 53), (54, 60),
+                                                        (61, 64), (65, 70),
+                                                        (71, 73), (74, 80)])
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py
index de99a7b8..9a8ab828 100644
--- a/tests/utils/test_ast.py
+++ b/tests/utils/test_ast.py
@@ -30,7 +30,7 @@ class AstScaningTest(unittest.TestCase):
     def test_ast_scaning_class(self):
         astScaner = AstScaning()
         pipeline_file = os.path.join(MODELSCOPE_PATH, 'pipelines', 'nlp',
-                                     'sequence_classification_pipeline.py')
+                                     'text_generation_pipeline.py')
         output = astScaner.generate_ast(pipeline_file)
         self.assertTrue(output['imports'] is not None)
         self.assertTrue(output['from_imports'] is not None)
@@ -40,14 +40,12 @@ class AstScaningTest(unittest.TestCase):
         self.assertIsInstance(imports, dict)
         self.assertIsInstance(from_imports, dict)
         self.assertIsInstance(decorators, list)
-        self.assertListEqual(
-            list(set(imports.keys()) - set(['typing', 'numpy'])), [])
-        self.assertEqual(len(from_imports.keys()), 9)
+        self.assertListEqual(list(set(imports.keys()) - set(['torch'])), [])
+        self.assertEqual(len(from_imports.keys()), 7)
         self.assertTrue(from_imports['modelscope.metainfo'] is not None)
         self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines'])
-        self.assertEqual(
-            decorators,
-            [('PIPELINES', 'text-classification', 'sentiment-analysis')])
+        self.assertEqual(decorators,
+                         [('PIPELINES', 'text-generation', 'text-generation')])
 
     def test_files_scaning_method(self):
         fileScaner = FilesAstScaning()

From 91231b3c157ac875f67e2bbd420a8810da0c0e36 Mon Sep 17 00:00:00 2001
From: ly261666 <ly261666@alibaba-inc.com>
Date: Tue, 27 Sep 2022 23:09:13 +0800
Subject: [PATCH 155/175] [to #42322933]add copyright on
 mogface,retinaface,mtcnn,ulfd pipeline         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10266086

---
 modelscope/pipelines/cv/mog_face_detection_pipeline.py    | 1 +
 modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py  | 1 +
 modelscope/pipelines/cv/retina_face_detection_pipeline.py | 1 +
 modelscope/pipelines/cv/ulfd_face_detection_pipeline.py   | 1 +
 4 files changed, 4 insertions(+)

diff --git a/modelscope/pipelines/cv/mog_face_detection_pipeline.py b/modelscope/pipelines/cv/mog_face_detection_pipeline.py
index 8797ad12..124b605b 100644
--- a/modelscope/pipelines/cv/mog_face_detection_pipeline.py
+++ b/modelscope/pipelines/cv/mog_face_detection_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py
index 57bf9920..bda46a70 100644
--- a/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py
+++ b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py
index b8c64405..40f2336a 100644
--- a/modelscope/pipelines/cv/retina_face_detection_pipeline.py
+++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py
index 1263082b..e9901d64 100644
--- a/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py
+++ b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 

From 3d41d6d6208edfcdb7cf7c00c571e0579405cde7 Mon Sep 17 00:00:00 2001
From: "tianchu.gtc" <tianchu.gtc@alibaba-inc.com>
Date: Tue, 27 Sep 2022 23:22:46 +0800
Subject: [PATCH 156/175] [to #42322933] fix seg4demo         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10189886

---
 .../image_panoptic_segmentation/panseg_model.py |  3 +--
 .../pan_merge/__init__.py                       |  1 +
 .../pan_merge/maskformer_semantic_head.py       |  1 +
 .../semantic_seg_model.py                       |  1 +
 .../vit_adapter/__init__.py                     |  2 ++
 .../vit_adapter/models/__init__.py              |  2 ++
 .../vit_adapter/models/backbone/__init__.py     |  2 ++
 .../models/backbone/adapter_modules.py          | 17 ++++++++---------
 .../models/backbone/base/__init__.py            |  2 ++
 .../vit_adapter/models/backbone/base/beit.py    |  6 ++----
 .../vit_adapter/models/backbone/beit_adapter.py | 13 ++++++-------
 .../vit_adapter/models/decode_heads/__init__.py |  2 ++
 .../models/decode_heads/base_decode_head.py     |  5 ++---
 .../decode_heads/mask2former_head_from_mmseg.py |  5 ++---
 .../vit_adapter/models/segmentors/__init__.py   |  2 ++
 .../models/segmentors/base_segmentor.py         |  5 ++---
 .../segmentors/encoder_decoder_mask2former.py   |  5 ++---
 .../vit_adapter/utils/__init__.py               |  2 ++
 .../vit_adapter/utils/builder.py                |  5 ++---
 .../vit_adapter/utils/seg_func.py               |  5 ++---
 .../cv/image_panoptic_segmentation_pipeline.py  | 16 +++++++---------
 .../cv/image_semantic_segmentation_pipeline.py  | 17 ++++++-----------
 22 files changed, 59 insertions(+), 60 deletions(-)

diff --git a/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
index f9022f90..f44c01e8 100644
--- a/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
+++ b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 
 import torch
@@ -49,6 +50,4 @@ class SwinLPanopticSegmentation(TorchModel):
         return results
 
     def forward(self, Inputs):
-        import pdb
-        pdb.set_trace()
         return self.model(**Inputs)
diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
index 2a75f318..6a31a308 100644
--- a/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
@@ -1 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .maskformer_semantic_head import MaskFormerSemanticHead
diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
index 6769ebaf..2f3364d0 100644
--- a/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
+++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn.functional as F
 from mmdet.models.builder import HEADS
diff --git a/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
index 60acf28f..2b38ebad 100644
--- a/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
+++ b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 
 import numpy as np
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
index 82eec1c6..3b9a301c 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 from .models import backbone, decode_heads, segmentors
 from .utils import (ResizeToMultiple, add_prefix, build_pixel_sampler,
                     seg_resize)
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
index ae5c5acf..791dd26f 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 from .backbone import BASEBEiT, BEiTAdapter
 from .decode_heads import Mask2FormerHeadFromMMSeg
 from .segmentors import EncoderDecoderMask2Former
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
index ab4258c1..7abd0ef1 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 from .base import BASEBEiT
 from .beit_adapter import BEiTAdapter
 
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
index 03080342..cf30cca0 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
@@ -1,6 +1,5 @@
-# The implementation refers to the VitAdapter
-# available at
-# https://github.com/czczup/ViT-Adapter.git
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 
 import logging
 from functools import partial
@@ -417,7 +416,7 @@ class SpatialPriorModule(nn.Module):
         self.stem = nn.Sequential(*[
             nn.Conv2d(
                 3, inplanes, kernel_size=3, stride=2, padding=1, bias=False),
-            nn.SyncBatchNorm(inplanes),
+            nn.BatchNorm2d(inplanes),
             nn.ReLU(inplace=True),
             nn.Conv2d(
                 inplanes,
@@ -426,7 +425,7 @@ class SpatialPriorModule(nn.Module):
                 stride=1,
                 padding=1,
                 bias=False),
-            nn.SyncBatchNorm(inplanes),
+            nn.BatchNorm2d(inplanes),
             nn.ReLU(inplace=True),
             nn.Conv2d(
                 inplanes,
@@ -435,7 +434,7 @@ class SpatialPriorModule(nn.Module):
                 stride=1,
                 padding=1,
                 bias=False),
-            nn.SyncBatchNorm(inplanes),
+            nn.BatchNorm2d(inplanes),
             nn.ReLU(inplace=True),
             nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
         ])
@@ -447,7 +446,7 @@ class SpatialPriorModule(nn.Module):
                 stride=2,
                 padding=1,
                 bias=False),
-            nn.SyncBatchNorm(2 * inplanes),
+            nn.BatchNorm2d(2 * inplanes),
             nn.ReLU(inplace=True)
         ])
         self.conv3 = nn.Sequential(*[
@@ -458,7 +457,7 @@ class SpatialPriorModule(nn.Module):
                 stride=2,
                 padding=1,
                 bias=False),
-            nn.SyncBatchNorm(4 * inplanes),
+            nn.BatchNorm2d(4 * inplanes),
             nn.ReLU(inplace=True)
         ])
         self.conv4 = nn.Sequential(*[
@@ -469,7 +468,7 @@ class SpatialPriorModule(nn.Module):
                 stride=2,
                 padding=1,
                 bias=False),
-            nn.SyncBatchNorm(4 * inplanes),
+            nn.BatchNorm2d(4 * inplanes),
             nn.ReLU(inplace=True)
         ])
         self.fc1 = nn.Conv2d(
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
index 40b0fa89..5b33031f 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 from .beit import BASEBEiT
 
 __all__ = ['BASEBEiT']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
index a5811fb9..62f873ec 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
@@ -1,7 +1,5 @@
-# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)
-# Github source: https://github.com/microsoft/unilm/tree/master/beit
-# This implementation refers to
-# https://github.com/czczup/ViT-Adapter.git
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 import math
 from functools import partial
 
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
index 02a4968e..182fc0c1 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
@@ -1,6 +1,5 @@
-# The implementation refers to the VitAdapter
-# available at
-# https://github.com/czczup/ViT-Adapter.git
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 import logging
 import math
 
@@ -69,10 +68,10 @@ class BEiTAdapter(BASEBEiT):
         ])
 
         self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2)
-        self.norm1 = nn.SyncBatchNorm(embed_dim)
-        self.norm2 = nn.SyncBatchNorm(embed_dim)
-        self.norm3 = nn.SyncBatchNorm(embed_dim)
-        self.norm4 = nn.SyncBatchNorm(embed_dim)
+        self.norm1 = nn.BatchNorm2d(embed_dim)
+        self.norm2 = nn.BatchNorm2d(embed_dim)
+        self.norm3 = nn.BatchNorm2d(embed_dim)
+        self.norm4 = nn.BatchNorm2d(embed_dim)
 
         self.up.apply(self._init_weights)
         self.spm.apply(self._init_weights)
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
index 9367806f..12bf2a21 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 from .mask2former_head_from_mmseg import Mask2FormerHeadFromMMSeg
 
 __all__ = ['Mask2FormerHeadFromMMSeg']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
index 36660520..ae7a0416 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
@@ -1,6 +1,5 @@
-# The implementation refers to the VitAdapter
-# available at
-# https://github.com/czczup/ViT-Adapter.git
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 from abc import ABCMeta, abstractmethod
 
 import torch
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
index ad8b1586..c0681d2b 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
@@ -1,6 +1,5 @@
-# The implementation refers to the VitAdapter
-# available at
-# https://github.com/czczup/ViT-Adapter.git
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 
 import copy
 
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
index 1f2c8b04..18bbce0d 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 from .encoder_decoder_mask2former import EncoderDecoderMask2Former
 
 __all__ = ['EncoderDecoderMask2Former']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
index 8bd8fa3f..311352c2 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
@@ -1,6 +1,5 @@
-# The implementation refers to the VitAdapter
-# available at
-# https://github.com/czczup/ViT-Adapter.git
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 import warnings
 from abc import ABCMeta, abstractmethod
 from collections import OrderedDict
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
index 9287e8aa..50492374 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
@@ -1,6 +1,5 @@
-# The implementation refers to the VitAdapter
-# available at
-# https://github.com/czczup/ViT-Adapter.git
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
index dec8a5f2..9c4d5c4c 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 from .builder import build_pixel_sampler
 from .data_process_func import ResizeToMultiple
 from .seg_func import add_prefix, seg_resize
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
index 63d77fea..0603ef94 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
@@ -1,6 +1,5 @@
-# The implementation refers to the VitAdapter
-# available at
-# https://github.com/czczup/ViT-Adapter.git
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 from mmcv.utils import Registry, build_from_cfg
 
 PIXEL_SAMPLERS = Registry('pixel sampler')
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
index fba46b81..db564cca 100644
--- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
@@ -1,6 +1,5 @@
-# The implementation refers to the VitAdapter
-# available at
-# https://github.com/czczup/ViT-Adapter.git
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
 
 import warnings
 
diff --git a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py
index 9ffc2b03..b96e709c 100644
--- a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py
@@ -4,11 +4,13 @@ from typing import Any, Dict, Union
 import cv2
 import numpy as np
 import PIL
+import torch
 
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import load_image
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 
@@ -39,28 +41,24 @@ class ImagePanopticSegmentationPipeline(Pipeline):
         # build the data pipeline
 
         if isinstance(input, str):
-            # input is str, file names, pipeline loadimagefromfile
-            # collect data
-            data = dict(img_info=dict(filename=input), img_prefix=None)
+            cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+            img = np.array(load_image(input))
+            img = img[:, :, ::-1]  # convert to bgr
         elif isinstance(input, PIL.Image.Image):
             cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
             img = np.array(input.convert('RGB'))
-            # collect data
-            data = dict(img=img)
         elif isinstance(input, np.ndarray):
             cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
             if len(input.shape) == 2:
                 img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
             else:
                 img = input
-            img = img[:, :, ::-1]  # in rgb order
-            # collect data
-            data = dict(img=img)
-
         else:
             raise TypeError(f'input should be either str, PIL.Image,'
                             f' np.array, but got {type(input)}')
 
+        # collect data
+        data = dict(img=img)
         cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
         test_pipeline = Compose(cfg.data.test.pipeline)
 
diff --git a/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py
index e3e1fd6b..023d9712 100644
--- a/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py
@@ -10,6 +10,7 @@ from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import load_image
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 
@@ -40,28 +41,24 @@ class ImageSemanticSegmentationPipeline(Pipeline):
         # build the data pipeline
 
         if isinstance(input, str):
-            # input is str, file names, pipeline loadimagefromfile
-            # collect data
-            data = dict(img_info=dict(filename=input), img_prefix=None)
+            cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+            img = np.array(load_image(input))
+            img = img[:, :, ::-1]  # convert to bgr
         elif isinstance(input, PIL.Image.Image):  # BGR
             cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
             img = np.array(input)[:, :, ::-1]
-            # collect data
-            data = dict(img=img)
         elif isinstance(input, np.ndarray):
             cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
             if len(input.shape) == 2:
                 img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
             else:
                 img = input
-            # collect data
-            data = dict(img=img)
-
         else:
             raise TypeError(f'input should be either str, PIL.Image,'
                             f' np.array, but got {type(input)}')
 
-        # data = dict(img=input)
+        # collect data
+        data = dict(img=img)
         cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
         test_pipeline = Compose(cfg.data.test.pipeline)
 
@@ -80,11 +77,9 @@ class ImageSemanticSegmentationPipeline(Pipeline):
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
         results = self.model.inference(input)
-
         return results
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-
         results = self.model.postprocess(inputs)
         outputs = {
             OutputKeys.MASKS: results[OutputKeys.MASKS],

From a3598f8d8c09ced380c9393d5c5208ef65aa13dd Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Tue, 27 Sep 2022 23:24:58 +0800
Subject: [PATCH 157/175] [to #42322933] Fix rouge metrics for chinese text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

修复 TextGenerationMetric 中 Rouge 指标计算中文时结果不正确的问题

为文本生成添加 BLEU 指标
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10254323
---
 modelscope/metrics/builder.py                |  4 ++
 modelscope/metrics/text_generation_metric.py | 62 +++++++++++++++-----
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py
index 800e3508..9e875cc4 100644
--- a/modelscope/metrics/builder.py
+++ b/modelscope/metrics/builder.py
@@ -18,6 +18,10 @@ class MetricKeys(object):
     SSIM = 'ssim'
     AVERAGE_LOSS = 'avg_loss'
     FScore = 'fscore'
+    BLEU_1 = 'bleu-1'
+    BLEU_4 = 'bleu-4'
+    ROUGE_1 = 'rouge-1'
+    ROUGE_L = 'rouge-l'
 
 
 task_default_metrics = {
diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py
index f154281d..90b80425 100644
--- a/modelscope/metrics/text_generation_metric.py
+++ b/modelscope/metrics/text_generation_metric.py
@@ -1,11 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from typing import Dict
+from typing import Dict, Iterable, List
+
+from nltk.translate.bleu_score import sentence_bleu
+from rouge import Rouge
 
 from modelscope.metainfo import Metrics
+from modelscope.metrics.base import Metric
+from modelscope.metrics.builder import METRICS, MetricKeys
 from modelscope.utils.registry import default_group
-from .base import Metric
-from .builder import METRICS, MetricKeys
 
 
 @METRICS.register_module(
@@ -17,20 +20,49 @@ class TextGenerationMetric(Metric):
     """
 
     def __init__(self):
-        self.preds = []
-        self.tgts = []
-        from rouge_score import rouge_scorer
-        self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
+        self.preds: List[str] = []
+        self.tgts: List[str] = []
+        self.rouge = Rouge()
 
-    def add(self, outputs: Dict, inputs: Dict):
+    @staticmethod
+    def is_chinese_char(char: str):
+        # the length of char must be 1
+        return '\u4e00' <= char <= '\u9fa5'
+
+    # add space for each chinese char
+    def rebuild_str(self, string: str):
+        return ' '.join(''.join([
+            f' {char} ' if self.is_chinese_char(char) else char
+            for char in string
+        ]).split())
+
+    def add(self, outputs: Dict[str, List[str]], inputs: Dict = None):
         ground_truths = outputs['tgts']
         eval_results = outputs['preds']
-        self.preds.extend(eval_results)
-        self.tgts.extend(ground_truths)
+        for truth in ground_truths:
+            self.tgts.append(self.rebuild_str(truth))
+        for result in eval_results:
+            self.preds.append(self.rebuild_str(result))
 
     def evaluate(self):
-        scores = [
-            self.scorer.score(pred, tgt)['rougeL'].fmeasure
-            for pred, tgt in zip(self.preds, self.tgts)
-        ]
-        return {MetricKeys.F1: sum(scores) / len(scores)}
+
+        def mean(iter: Iterable) -> float:
+            return sum(iter) / len(self.preds)
+
+        rouge_scores = self.rouge.get_scores(hyps=self.preds, refs=self.tgts)
+        rouge_1 = mean(map(lambda score: score['rouge-1']['f'], rouge_scores))
+        rouge_l = mean(map(lambda score: score['rouge-l']['f'], rouge_scores))
+        pred_split = tuple(pred.split(' ') for pred in self.preds)
+        tgt_split = tuple(tgt.split(' ') for tgt in self.tgts)
+        bleu_1 = mean(
+            sentence_bleu([tgt], pred, weights=(1, 0, 0, 0))
+            for pred, tgt in zip(pred_split, tgt_split))
+        bleu_4 = mean(
+            sentence_bleu([tgt], pred)
+            for pred, tgt in zip(pred_split, tgt_split))
+        return {
+            MetricKeys.ROUGE_1: rouge_1,
+            MetricKeys.ROUGE_L: rouge_l,
+            MetricKeys.BLEU_1: bleu_1,
+            MetricKeys.BLEU_4: bleu_4
+        }

From 11b33164c33cc3fae3a195037a278c3cb87484a6 Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingda.chen@alibaba-inc.com>
Date: Wed, 28 Sep 2022 09:26:44 +0800
Subject: [PATCH 158/175]  [to #42322933] disable t5 test temporarily

---
 tests/pipelines/test_text2text_generation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py
index 04cecf93..a39562f5 100644
--- a/tests/pipelines/test_text2text_generation.py
+++ b/tests/pipelines/test_text2text_generation.py
@@ -30,7 +30,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
             f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}'
         )
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_pipeline_with_model_instance(self):
         model = Model.from_pretrained(self.model_id)
         preprocessor = Text2TextGenerationPreprocessor(model.model_dir)
@@ -40,7 +40,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
             preprocessor=preprocessor)
         print(pipeline_ins(self.input))
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_pipeline_with_model_id(self):
         pipeline_ins = pipeline(
             task=Tasks.text2text_generation, model=self.model_id)

From c51b74c2ea6f2c736955a34599a745b2cd0d02a3 Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Wed, 28 Sep 2022 13:36:09 +0800
Subject: [PATCH 159/175] [to #45220645]fix: fix ffmpeg mp4 encoder bug        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10284398

    * [to #45220645]fix: fix ffmpeg mp4 encoder bug
---
 docker/Dockerfile.ubuntu | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index e0bfa908..a9a409b5 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -34,7 +34,8 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${a
     cp /tmp/resources/conda.tuna  ~/.condarc && \
     source /root/.bashrc && \
     conda install --yes python==${PYTHON_VERSION} && \
-    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
+    pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn
 
 ARG USE_GPU=True
 
@@ -42,15 +43,15 @@ ARG USE_GPU=True
 ARG TORCH_VERSION=1.12.0
 ARG CUDATOOLKIT_VERSION=11.3
 RUN if [ "$USE_GPU" = "True" ] ; then \
-        conda install --yes pytorch==$TORCH_VERSION torchvision torchaudio cudatoolkit=$CUDATOOLKIT_VERSION -c pytorch && conda clean --yes --all; \
+        pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113; \
     else \
-        conda install pytorch==$TORCH_VERSION torchvision torchaudio cpuonly -c pytorch; \
+        pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu; \
     fi
 
 # install tensorflow
 ARG TENSORFLOW_VERSION=1.15.5
 RUN if [ "$USE_GPU" = "True" ] ; then \
-        pip install --no-cache-dir --use-deprecated=legacy-resolver tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \
+        pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \
     else \
         pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \
     fi
@@ -75,9 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
 ENV SHELL=/bin/bash
 
 # install special package
-RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 ipykernel && \
-    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
-    pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn
+RUN pip install --no-cache-dir 'mmcls>=0.21.0' 'mmdet>=2.25.0' 'decord>=0.6.0' datasets==2.1.0 numpy==1.18.5 ipykernel fairseq
 
 RUN if [ "$USE_GPU" = "True" ] ; then \
         pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \

From 0e52a20d2889bca5c0f8165d3013bd46de4afccc Mon Sep 17 00:00:00 2001
From: "chaojie.mcj" <chaojie.mcj@alibaba-inc.com>
Date: Wed, 28 Sep 2022 14:30:37 +0800
Subject: [PATCH 160/175] [to #42322933]update license
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

以下算法进行了header变更：
modelscope.models.cv.cmdssl_video_embedding
modelscope.models.cv.action_recognition
modelscope.models.cv.animal_recognition
modelscope.models.multi_modal.multi_stage_diffusion
modelscope.models.multi_modal.gemm

modelscope.pipelines.cv.live_category_pipeline
modelscope.pipelines.cv.video_category_pipeline
modelscope.models.cv.image_to_image_translation
modelscope.models.cv.image_to_image_generation

modelscope.models.cv.video_inpainting
modelscope.models.multi_modal.diffusion
modelscope.models.multi_modal.team
modelscope.models.cv.shop_segmentation
modelscope.models.cv.text_driven_segmentation
modelscope.models.cv.action_recognition


modelscope.models.cv.face_emotion
modelscope.models.cv.hand_static
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10268474
---
 .../models/cv/action_recognition/models.py      |  3 +++
 modelscope/models/cv/action_recognition/s3dg.py |  3 +++
 .../cv/action_recognition/tada_convnext.py      |  4 ++++
 .../models/cv/animal_recognition/resnet.py      |  3 +++
 .../models/cv/animal_recognition/splat.py       |  3 +++
 .../cv/cmdssl_video_embedding/__init__.py       |  3 ++-
 .../models/cv/cmdssl_video_embedding/c3d.py     |  8 ++++++++
 .../cv/cmdssl_video_embedding/resnet2p1d.py     |  8 ++++++++
 .../cv/cmdssl_video_embedding/resnet3d.py       |  8 ++++++++
 .../models/cv/shop_segmentation/common.py       | 14 ++++++--------
 .../models/cv/shop_segmentation/head_fpn.py     | 14 ++++++--------
 .../models/cv/shop_segmentation/models.py       | 14 ++++++--------
 .../models/cv/shop_segmentation/neck_fpn.py     | 14 ++++++--------
 .../cv/shop_segmentation/shop_seg_base.py       | 14 ++++++--------
 .../cv/shop_segmentation/shop_seg_model.py      |  2 ++
 modelscope/models/cv/shop_segmentation/utils.py |  7 +++----
 .../cv/text_driven_segmentation/__init__.py     |  1 +
 .../models/cv/text_driven_segmentation/clip.py  |  7 +++----
 .../cv/text_driven_segmentation/lseg_base.py    |  6 ++----
 .../cv/text_driven_segmentation/lseg_blocks.py  |  6 ++----
 .../cv/text_driven_segmentation/lseg_model.py   |  2 ++
 .../cv/text_driven_segmentation/lseg_net.py     |  6 ++----
 .../cv/text_driven_segmentation/lseg_vit.py     |  6 ++----
 .../models/cv/text_driven_segmentation/model.py |  6 ++----
 .../simple_tokenizer.py                         |  7 +++----
 .../models/multi_modal/diffusion/diffusion.py   |  3 +++
 .../models/multi_modal/diffusion/model.py       |  1 +
 .../multi_modal/diffusion/unet_generator.py     |  3 +++
 .../diffusion/unet_upsampler_1024.py            |  3 +++
 .../multi_modal/diffusion/unet_upsampler_256.py |  3 +++
 modelscope/models/multi_modal/gemm/gemm_base.py | 17 +++++++++++------
 .../models/multi_modal/gemm/gemm_model.py       |  2 ++
 modelscope/models/multi_modal/gemm/tokenizer.py | 12 ++++++++----
 modelscope/models/multi_modal/mmr/__init__.py   |  2 ++
 .../mmr/dataloaders/rawvideo_util.py            |  3 +++
 .../models/multi_modal/mmr/models/__init__.py   |  2 ++
 .../mmr/models/clip_for_mm_video_embedding.py   |  3 +++
 .../mmr/models/dynamic_inverted_softmax.py      |  3 +++
 .../models/multi_modal/mmr/models/modeling.py   |  2 ++
 .../multi_modal/mmr/models/module_clip.py       |  3 ++-
 .../multi_modal/mmr/models/module_cross.py      |  3 +++
 .../multi_modal/mmr/models/tokenization_clip.py |  3 +++
 .../multi_modal/multi_stage_diffusion/clip.py   |  3 ++-
 .../multi_stage_diffusion/decoder.py            |  2 +-
 .../multi_stage_diffusion/gaussian_diffusion.py |  5 +++--
 .../multi_modal/multi_stage_diffusion/model.py  |  2 +-
 .../multi_modal/multi_stage_diffusion/prior.py  |  2 +-
 .../multi_stage_diffusion/tokenizer.py          |  3 ++-
 .../multi_stage_diffusion/upsampler.py          |  2 +-
 .../multi_modal/multi_stage_diffusion/xglm.py   |  5 +++--
 .../models/multi_modal/team/team_model.py       |  1 +
 modelscope/models/multi_modal/team/utils.py     | 11 +++++++----
 .../pipelines/cv/animal_recognition_pipeline.py |  1 +
 .../cv/cmdssl_video_embedding_pipeline.py       |  2 ++
 .../cv/general_recognition_pipeline.py          |  1 +
 .../pipelines/cv/live_category_pipeline.py      |  2 +-
 .../pipelines/cv/shop_segmentation_pipleline.py |  1 +
 .../cv/text_driven_segmentation_pipleline.py    |  1 +
 .../pipelines/cv/video_category_pipeline.py     |  2 +-
 ...generative_multi_modal_embedding_pipeline.py |  2 +-
 .../team_multi_modal_similarity_pipeline.py     |  3 +--
 tests/pipelines/test_cmdssl_video_embedding.py  |  2 +-
 .../test_generative_multi_modal_embedding.py    |  2 +-
 tests/pipelines/test_multi_modal_similarity.py  |  2 +-
 64 files changed, 188 insertions(+), 106 deletions(-)

diff --git a/modelscope/models/cv/action_recognition/models.py b/modelscope/models/cv/action_recognition/models.py
index a5964e21..f16805fb 100644
--- a/modelscope/models/cv/action_recognition/models.py
+++ b/modelscope/models/cv/action_recognition/models.py
@@ -1,3 +1,6 @@
+# The implementation is also open-sourced by the authors,
+# and available at https://github.com/alibaba-mmai-research/TAdaConv
+# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved.
 import torch.nn as nn
 
 from .s3dg import Inception3D
diff --git a/modelscope/models/cv/action_recognition/s3dg.py b/modelscope/models/cv/action_recognition/s3dg.py
index f258df16..46e76892 100644
--- a/modelscope/models/cv/action_recognition/s3dg.py
+++ b/modelscope/models/cv/action_recognition/s3dg.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from https://github.com/TengdaHan/CoCLR,
+# made publicly available under the Apache License, Version 2.0 at https://github.com/TengdaHan/CoCLR
+# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved.
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/action_recognition/tada_convnext.py b/modelscope/models/cv/action_recognition/tada_convnext.py
index 379b5271..b1de7af8 100644
--- a/modelscope/models/cv/action_recognition/tada_convnext.py
+++ b/modelscope/models/cv/action_recognition/tada_convnext.py
@@ -1,3 +1,7 @@
+# The implementation is adopted from https://github.com/facebookresearch/ConvNeXt,
+# made publicly available under the MIT License at https://github.com/facebookresearch/ConvNeXt
+# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved.
+
 import math
 
 import torch
diff --git a/modelscope/models/cv/animal_recognition/resnet.py b/modelscope/models/cv/animal_recognition/resnet.py
index 73953de4..d7c03c29 100644
--- a/modelscope/models/cv/animal_recognition/resnet.py
+++ b/modelscope/models/cv/animal_recognition/resnet.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from Split-Attention Network, A New ResNet Variant,
+# made publicly available under the Apache License 2.0
+# at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/resnet.py
 import math
 
 import torch
diff --git a/modelscope/models/cv/animal_recognition/splat.py b/modelscope/models/cv/animal_recognition/splat.py
index 0aab555e..a10d0abe 100644
--- a/modelscope/models/cv/animal_recognition/splat.py
+++ b/modelscope/models/cv/animal_recognition/splat.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from Split-Attention Network, A New ResNet Variant,
+# made publicly available under the Apache License 2.0
+# at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/splat.py
 """Split-Attention"""
 
 import torch
diff --git a/modelscope/models/cv/cmdssl_video_embedding/__init__.py b/modelscope/models/cv/cmdssl_video_embedding/__init__.py
index e7e156a5..5bc67b63 100644
--- a/modelscope/models/cv/cmdssl_video_embedding/__init__.py
+++ b/modelscope/models/cv/cmdssl_video_embedding/__init__.py
@@ -1,4 +1,5 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
diff --git a/modelscope/models/cv/cmdssl_video_embedding/c3d.py b/modelscope/models/cv/cmdssl_video_embedding/c3d.py
index 62f0e0b9..53dd05a1 100644
--- a/modelscope/models/cv/cmdssl_video_embedding/c3d.py
+++ b/modelscope/models/cv/cmdssl_video_embedding/c3d.py
@@ -1,3 +1,11 @@
+# Copyright 2022 Davide Abati.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+# The implementation here is modified based on c3d-pytorch,
+# originally MIT License, Copyright (c) 2022 Davide Abati,
+# and publicly available at https://github.com/DavideA/c3d-pytorch
+""" C3D Model Architecture."""
+
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py b/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py
index 3b03cc74..b49069d1 100644
--- a/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py
+++ b/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py
@@ -1,3 +1,11 @@
+# Copyright (c) 2022 Kensho Hara.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+# The implementation here is modified based on 3D-ResNets-PyTorch,
+# originally MIT License, Copyright (c) 2022 Kensho Hara,
+# and publicly available at https://github.com/kenshohara/3D-ResNets-PyTorch/blob/master/models/resnet2p1d.py
+""" ResNet2plus1d Model Architecture."""
+
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py b/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py
index 24d50a8e..dddba06f 100644
--- a/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py
+++ b/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py
@@ -1,3 +1,11 @@
+# Copyright (c) 2022 Kensho Hara.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+# The implementation here is modified based on 3D-ResNets-PyTorch,
+# originally MIT License, Copyright (c) 2022 Kensho Hara,
+# and publicly available at https://github.com/kenshohara/3D-ResNets-PyTorch/blob/master/models/resnet.py
+""" ResNet3D Model Architecture."""
+
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/shop_segmentation/common.py b/modelscope/models/cv/shop_segmentation/common.py
index 00ba9996..8cb940a5 100644
--- a/modelscope/models/cv/shop_segmentation/common.py
+++ b/modelscope/models/cv/shop_segmentation/common.py
@@ -1,11 +1,9 @@
-"""
-Base modules are adapted from https://github.com/open-mmlab/mmcv/,
-originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
-https://github.com/open-mmlab/mmsegmentation/,
-originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
-and adapted from https://github.com/raoyongming/DenseCLIP/,
-originally MIT License, Copyright (c) 2022 Rao, Yongming.
-"""
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
 
 import warnings
 
diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py
index b3faa9b8..cad389c7 100644
--- a/modelscope/models/cv/shop_segmentation/head_fpn.py
+++ b/modelscope/models/cv/shop_segmentation/head_fpn.py
@@ -1,11 +1,9 @@
-""" FPNHead
-Base modules are adapted from https://github.com/open-mmlab/mmcv/,
-originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
-https://github.com/open-mmlab/mmsegmentation/,
-originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
-and adapted from https://github.com/raoyongming/DenseCLIP/,
-originally MIT License, Copyright (c) 2022 Rao, Yongming.
-"""
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
 
 import numpy as np
 import torch
diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py
index 171aafbd..3880d074 100644
--- a/modelscope/models/cv/shop_segmentation/models.py
+++ b/modelscope/models/cv/shop_segmentation/models.py
@@ -1,11 +1,9 @@
-"""
-Base modules are adapted from https://github.com/open-mmlab/mmcv/,
-originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
-https://github.com/open-mmlab/mmsegmentation/,
-originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
-and adapted from https://github.com/raoyongming/DenseCLIP/,
-originally MIT License, Copyright (c) 2022 Rao, Yongming.
-"""
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
 
 import math
 from collections import OrderedDict
diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py
index 108cb043..aa4d7159 100644
--- a/modelscope/models/cv/shop_segmentation/neck_fpn.py
+++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py
@@ -1,11 +1,9 @@
-""" FPNneck
-Base modules are adapted from https://github.com/open-mmlab/mmcv/,
-originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
-https://github.com/open-mmlab/mmsegmentation/,
-originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
-and adapted from https://github.com/raoyongming/DenseCLIP/,
-originally MIT License, Copyright (c) 2022 Rao, Yongming.
-"""
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
 
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_base.py b/modelscope/models/cv/shop_segmentation/shop_seg_base.py
index e3ae0d54..34686370 100644
--- a/modelscope/models/cv/shop_segmentation/shop_seg_base.py
+++ b/modelscope/models/cv/shop_segmentation/shop_seg_base.py
@@ -1,11 +1,9 @@
-"""
-Base modules are adapted from https://github.com/open-mmlab/mmcv/,
-originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
-https://github.com/open-mmlab/mmsegmentation/,
-originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
-and adapted from https://github.com/raoyongming/DenseCLIP/,
-originally MIT License, Copyright (c) 2022 Rao, Yongming.
-"""
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
 
 import torch
 import torch.nn as nn
diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_model.py b/modelscope/models/cv/shop_segmentation/shop_seg_model.py
index 0aeeb1de..ac0d67fa 100644
--- a/modelscope/models/cv/shop_segmentation/shop_seg_model.py
+++ b/modelscope/models/cv/shop_segmentation/shop_seg_model.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/models/cv/shop_segmentation/utils.py b/modelscope/models/cv/shop_segmentation/utils.py
index c41f8a65..4035b0ef 100644
--- a/modelscope/models/cv/shop_segmentation/utils.py
+++ b/modelscope/models/cv/shop_segmentation/utils.py
@@ -1,7 +1,6 @@
-""" CLIP Tokenizer
-Adapted from https://github.com/openai/CLIP.
-Originally MIT License, Copyright (c) 2021 OpenAI.
-"""
+# CLIP Tokenizer
+# Adapted from https://github.com/openai/CLIP.
+# Originally MIT License, Copyright (c) 2021 OpenAI.
 
 import gzip
 import html
diff --git a/modelscope/models/cv/text_driven_segmentation/__init__.py b/modelscope/models/cv/text_driven_segmentation/__init__.py
index 46daad78..aefaa698 100644
--- a/modelscope/models/cv/text_driven_segmentation/__init__.py
+++ b/modelscope/models/cv/text_driven_segmentation/__init__.py
@@ -1 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .lseg_base import TextDrivenSegmentation
diff --git a/modelscope/models/cv/text_driven_segmentation/clip.py b/modelscope/models/cv/text_driven_segmentation/clip.py
index 440cccea..1cec5f39 100644
--- a/modelscope/models/cv/text_driven_segmentation/clip.py
+++ b/modelscope/models/cv/text_driven_segmentation/clip.py
@@ -1,7 +1,6 @@
-""" CLIP
-Adapted from https://github.com/openai/CLIP.
-Originally MIT License, Copyright (c) 2021 OpenAI.
-"""
+# CLIP
+# Adapted from https://github.com/openai/CLIP.
+# Originally MIT License, Copyright (c) 2021 OpenAI.
 
 import hashlib
 import os
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_base.py b/modelscope/models/cv/text_driven_segmentation/lseg_base.py
index 20915396..c79861a7 100644
--- a/modelscope/models/cv/text_driven_segmentation/lseg_base.py
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_base.py
@@ -1,7 +1,5 @@
-"""
-Adapted from https://github.com/isl-org/lang-seg.
-Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
-"""
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
 
 import torch
 import torch.nn as nn
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
index cb550ab7..56d4a65d 100644
--- a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
@@ -1,7 +1,5 @@
-"""
-Adapted from https://github.com/isl-org/lang-seg.
-Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
-"""
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
 
 import torch
 import torch.nn as nn
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_model.py b/modelscope/models/cv/text_driven_segmentation/lseg_model.py
index 1d7ebdd1..9a5754c6 100644
--- a/modelscope/models/cv/text_driven_segmentation/lseg_model.py
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_model.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_net.py b/modelscope/models/cv/text_driven_segmentation/lseg_net.py
index 1a558c5c..541a4a38 100644
--- a/modelscope/models/cv/text_driven_segmentation/lseg_net.py
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_net.py
@@ -1,7 +1,5 @@
-"""
-Adapted from https://github.com/isl-org/lang-seg.
-Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
-"""
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
 
 import numpy as np
 import torch
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py
index be2813c2..5298832f 100644
--- a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py
@@ -1,7 +1,5 @@
-"""
-Adapted from https://github.com/isl-org/lang-seg.
-Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
-"""
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
 
 import math
 import types
diff --git a/modelscope/models/cv/text_driven_segmentation/model.py b/modelscope/models/cv/text_driven_segmentation/model.py
index ece10bab..f98d480d 100644
--- a/modelscope/models/cv/text_driven_segmentation/model.py
+++ b/modelscope/models/cv/text_driven_segmentation/model.py
@@ -1,7 +1,5 @@
-"""
-Adapted from https://github.com/isl-org/lang-seg.
-Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
-"""
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
 
 from collections import OrderedDict
 from typing import Tuple, Union
diff --git a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
index 250d680f..361d67c6 100644
--- a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
+++ b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
@@ -1,7 +1,6 @@
-""" CLIP
-Adapted from https://github.com/openai/CLIP.
-Originally MIT License, Copyright (c) 2021 OpenAI.
-"""
+# CLIP
+# Adapted from https://github.com/openai/CLIP.
+# Originally MIT License, Copyright (c) 2021 OpenAI.
 
 import gzip
 import html
diff --git a/modelscope/models/multi_modal/diffusion/diffusion.py b/modelscope/models/multi_modal/diffusion/diffusion.py
index d71fe0ae..bfe7baf7 100644
--- a/modelscope/models/multi_modal/diffusion/diffusion.py
+++ b/modelscope/models/multi_modal/diffusion/diffusion.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/multi_modal/diffusion/model.py b/modelscope/models/multi_modal/diffusion/model.py
index 8617b8dd..4229391f 100644
--- a/modelscope/models/multi_modal/diffusion/model.py
+++ b/modelscope/models/multi_modal/diffusion/model.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/models/multi_modal/diffusion/unet_generator.py b/modelscope/models/multi_modal/diffusion/unet_generator.py
index 9b507223..539d3996 100644
--- a/modelscope/models/multi_modal/diffusion/unet_generator.py
+++ b/modelscope/models/multi_modal/diffusion/unet_generator.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
index 1c66b2fe..38cff6a2 100644
--- a/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
+++ b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py
index 0da8b805..ca5cd7d6 100644
--- a/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py
+++ b/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 from functools import partial
 
diff --git a/modelscope/models/multi_modal/gemm/gemm_base.py b/modelscope/models/multi_modal/gemm/gemm_base.py
index db928212..09ef2480 100644
--- a/modelscope/models/multi_modal/gemm/gemm_base.py
+++ b/modelscope/models/multi_modal/gemm/gemm_base.py
@@ -1,9 +1,14 @@
-""" Generative Multimodal Model
-Base modules are adapted from https://github.com/openai/CLIP/,
-originally MIT License, Copyright (c) 2021 OpenAI,
-and adapted from https://github.com/lucidrains/CoCa-pytorch/,
-originally MIT License, Copyright (c) 2022 Phil Wang.
-"""
+# Copyright 2021 The OpenAI Team Authors.
+# Copyright 2022 Phil Wang.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+#
+# The implementation here is modified based on OpenAI CLIP,
+# originally MIT License, Copyright (c) 2021 OpenAI,
+# and publicly available at https://github.com/openai/CLIP/.
+# The implementation here is modified based on Coca-pytorch,
+# originally MIT License, Copyright (c) 2022 Phil Wang,
+# and publicly available at https://github.com/lucidrains/CoCa-pytorch/.
+""" Generative Multimodal Model Architecture."""
 
 import os
 from collections import OrderedDict
diff --git a/modelscope/models/multi_modal/gemm/gemm_model.py b/modelscope/models/multi_modal/gemm/gemm_model.py
index 356dc8d3..55b211c0 100644
--- a/modelscope/models/multi_modal/gemm/gemm_model.py
+++ b/modelscope/models/multi_modal/gemm/gemm_model.py
@@ -1,3 +1,5 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+""" Generative Multimodal Model Wrapper."""
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/models/multi_modal/gemm/tokenizer.py b/modelscope/models/multi_modal/gemm/tokenizer.py
index af962ceb..8b7cc094 100644
--- a/modelscope/models/multi_modal/gemm/tokenizer.py
+++ b/modelscope/models/multi_modal/gemm/tokenizer.py
@@ -1,7 +1,11 @@
-""" CLIP Tokenizer
-Adapted from https://github.com/openai/CLIP.
-Originally MIT License, Copyright (c) 2021 OpenAI.
-"""
+# Copyright 2021 The OpenAI Team Authors.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+#
+# The implementation here is modified based on OpenAI CLIP,
+# originally MIT License, Copyright (c) 2021 OpenAI,
+# and publicly available at https://github.com/openai/CLIP/.
+""" CLIP Tokenizer."""
+
 import gzip
 import html
 import os
diff --git a/modelscope/models/multi_modal/mmr/__init__.py b/modelscope/models/multi_modal/mmr/__init__.py
index c5fb7419..9dac8409 100644
--- a/modelscope/models/multi_modal/mmr/__init__.py
+++ b/modelscope/models/multi_modal/mmr/__init__.py
@@ -1 +1,3 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 from .models import VideoCLIPForMultiModalEmbedding
diff --git a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py
index eab1189f..c7ac3f94 100644
--- a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py
+++ b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from Huaishao Luo,
+# made publicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip
+
 import cv2
 import numpy as np
 import torch as th
diff --git a/modelscope/models/multi_modal/mmr/models/__init__.py b/modelscope/models/multi_modal/mmr/models/__init__.py
index 6cd06bcd..da832719 100644
--- a/modelscope/models/multi_modal/mmr/models/__init__.py
+++ b/modelscope/models/multi_modal/mmr/models/__init__.py
@@ -1 +1,3 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 from .clip_for_mm_video_embedding import VideoCLIPForMultiModalEmbedding
diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
index 8d13e745..5e8e2e7a 100644
--- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
+++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
+
 import random
 from os.path import exists
 from typing import Any, Dict
diff --git a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py
index 572f44bc..253a847c 100644
--- a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py
+++ b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
+
 import numpy as np
 
 
diff --git a/modelscope/models/multi_modal/mmr/models/modeling.py b/modelscope/models/multi_modal/mmr/models/modeling.py
index 21cc4c80..dc6510bf 100644
--- a/modelscope/models/multi_modal/mmr/models/modeling.py
+++ b/modelscope/models/multi_modal/mmr/models/modeling.py
@@ -1,3 +1,5 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 import os
 import platform
 from collections import OrderedDict
diff --git a/modelscope/models/multi_modal/mmr/models/module_clip.py b/modelscope/models/multi_modal/mmr/models/module_clip.py
index 36e56196..53501720 100644
--- a/modelscope/models/multi_modal/mmr/models/module_clip.py
+++ b/modelscope/models/multi_modal/mmr/models/module_clip.py
@@ -1,4 +1,5 @@
-# Part of the implementation is borrowed and modified from The OpenAI CLIP project.
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
 
 import hashlib
 import os
diff --git a/modelscope/models/multi_modal/mmr/models/module_cross.py b/modelscope/models/multi_modal/mmr/models/module_cross.py
index 05edb853..b958d5bc 100644
--- a/modelscope/models/multi_modal/mmr/models/module_cross.py
+++ b/modelscope/models/multi_modal/mmr/models/module_cross.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
+
 from __future__ import absolute_import, division, print_function
 import logging
 from collections import OrderedDict
diff --git a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py
index ee60f857..4e2c9b15 100644
--- a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py
+++ b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
+
 import gzip
 import html
 import os
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/clip.py b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py
index 54e971f7..98727066 100644
--- a/modelscope/models/multi_modal/multi_stage_diffusion/clip.py
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py
@@ -1,4 +1,5 @@
-# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP.
+# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import math
 
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py
index 17daedaf..eb52a48b 100644
--- a/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import math
 
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py
index a4fc52e0..9677d7c4 100644
--- a/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py
@@ -1,5 +1,6 @@
-# The implementation here is modified based on latent diffusion, publicly available
-# at https://github.com/CompVis/latent-diffusion.
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import math
 
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/model.py b/modelscope/models/multi_modal/multi_stage_diffusion/model.py
index c2d83b34..59bd837d 100644
--- a/modelscope/models/multi_modal/multi_stage_diffusion/model.py
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/model.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import math
 import os.path as osp
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/prior.py b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py
index 380fa467..9f4ef2d5 100644
--- a/modelscope/models/multi_modal/multi_stage_diffusion/prior.py
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import math
 
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py
index 6fd9bebe..59d6b304 100644
--- a/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py
@@ -1,4 +1,5 @@
-# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP.
+# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import gzip
 import html
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py
index 4e99a514..a292edae 100644
--- a/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import math
 
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py
index 8a0b3ff1..133da50b 100644
--- a/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py
@@ -1,5 +1,6 @@
-# The implementation here is modified based on HuggingFace XGLM, publicly available
-# at https://github.com/huggingface/transformers.
+# Part of the implementation is borrowed and modified from HuggingFace XGLM,
+# publicly available at https://github.com/huggingface/transformers.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import math
 
diff --git a/modelscope/models/multi_modal/team/team_model.py b/modelscope/models/multi_modal/team/team_model.py
index 4aa77e17..8c0e288a 100644
--- a/modelscope/models/multi_modal/team/team_model.py
+++ b/modelscope/models/multi_modal/team/team_model.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import Any, Dict
 
 import cv2
diff --git a/modelscope/models/multi_modal/team/utils.py b/modelscope/models/multi_modal/team/utils.py
index 3b3e394e..73919179 100644
--- a/modelscope/models/multi_modal/team/utils.py
+++ b/modelscope/models/multi_modal/team/utils.py
@@ -1,7 +1,10 @@
-""" Generative Multimodal Model
-Base Transformer code is adapted from https://github.com/openai/CLIP/,
-originally MIT License, Copyright (c) 2021 OpenAI,
-"""
+# Copyright 2021 The OpenAI Team Authors.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+#
+# The implementation here is modified based on OpenAI CLIP,
+# originally MIT License, Copyright (c) 2021 OpenAI,
+# and publicly available at https://github.com/openai/CLIP/.
+
 from collections import OrderedDict
 from typing import Tuple, Union
 
diff --git a/modelscope/pipelines/cv/animal_recognition_pipeline.py b/modelscope/pipelines/cv/animal_recognition_pipeline.py
index 18cba92c..fad14680 100644
--- a/modelscope/pipelines/cv/animal_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/animal_recognition_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py b/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py
index 9f4e2d93..deb17561 100644
--- a/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py
+++ b/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/general_recognition_pipeline.py b/modelscope/pipelines/cv/general_recognition_pipeline.py
index 9ba5117b..07222086 100644
--- a/modelscope/pipelines/cv/general_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/general_recognition_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/live_category_pipeline.py b/modelscope/pipelines/cv/live_category_pipeline.py
index c16ba6ba..715998cc 100644
--- a/modelscope/pipelines/cv/live_category_pipeline.py
+++ b/modelscope/pipelines/cv/live_category_pipeline.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/shop_segmentation_pipleline.py b/modelscope/pipelines/cv/shop_segmentation_pipleline.py
index b7fd90b4..d08058c3 100644
--- a/modelscope/pipelines/cv/shop_segmentation_pipleline.py
+++ b/modelscope/pipelines/cv/shop_segmentation_pipleline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py
index 0985b835..c7f9d4c2 100644
--- a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py
+++ b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/cv/video_category_pipeline.py b/modelscope/pipelines/cv/video_category_pipeline.py
index 196d3115..e4c73649 100644
--- a/modelscope/pipelines/cv/video_category_pipeline.py
+++ b/modelscope/pipelines/cv/video_category_pipeline.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py
index d3b9fef3..13032314 100644
--- a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
index fc123e2f..cafd6555 100644
--- a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
+++ b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
@@ -1,5 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/tests/pipelines/test_cmdssl_video_embedding.py b/tests/pipelines/test_cmdssl_video_embedding.py
index 68eae385..5807c075 100644
--- a/tests/pipelines/test_cmdssl_video_embedding.py
+++ b/tests/pipelines/test_cmdssl_video_embedding.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 # !/usr/bin/env python
 import unittest
 
diff --git a/tests/pipelines/test_generative_multi_modal_embedding.py b/tests/pipelines/test_generative_multi_modal_embedding.py
index 9232ebd4..7061d736 100644
--- a/tests/pipelines/test_generative_multi_modal_embedding.py
+++ b/tests/pipelines/test_generative_multi_modal_embedding.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import unittest
 
diff --git a/tests/pipelines/test_multi_modal_similarity.py b/tests/pipelines/test_multi_modal_similarity.py
index 192602b4..a54fbcf0 100644
--- a/tests/pipelines/test_multi_modal_similarity.py
+++ b/tests/pipelines/test_multi_modal_similarity.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import unittest
 

From af3ae447692c465827a6dd1c944aece5fdaa5405 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Tue, 27 Sep 2022 17:39:10 +0800
Subject: [PATCH 161/175] [to #44902165] bump version to 0.4.6

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index 68eb9b68..ab45471d 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.4.5'
+__version__ = '0.4.6'

From 97ffa3a8d22ab0680eaee1ec574bd626b6452a91 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Fri, 30 Sep 2022 10:14:53 +0800
Subject: [PATCH 162/175] [to #44902165] update easycv version

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10308804
---
 requirements/cv.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/cv.txt b/requirements/cv.txt
index 8c06242a..5a2d7763 100644
--- a/requirements/cv.txt
+++ b/requirements/cv.txt
@@ -14,7 +14,7 @@ mmcls>=0.21.0
 mmdet>=2.25.0
 networkx>=2.5
 onnxruntime>=1.10
-pai-easycv>=0.6.3.4
+pai-easycv>=0.6.3.6
 pandas
 psutil
 regex

From 8be4848de54ba3272641560924b18839926f8d69 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Fri, 30 Sep 2022 10:17:36 +0800
Subject: [PATCH 163/175] [to #44902165] bump version to 0.4.7

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index ab45471d..1e4826d6 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.4.6'
+__version__ = '0.4.7'

From 9fa761d7a6e26608fc4adfd54a58393e10dc0ca1 Mon Sep 17 00:00:00 2001
From: "lllcho.lc" <lllcho.lc@alibaba-inc.com>
Date: Sat, 1 Oct 2022 11:10:36 +0800
Subject: [PATCH 164/175] [to #42322933] add PST action recognition model

Add patch shift transformer model for action recognition task.
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10282964
---
 modelscope/metainfo.py                        |    1 +
 .../models/cv/action_recognition/__init__.py  |    2 +
 .../temporal_patch_shift_transformer.py       | 1198 +++++++++++++++++
 .../cv/action_recognition_pipeline.py         |   54 +-
 tests/pipelines/test_action_recognition.py    |    8 +
 5 files changed, 1262 insertions(+), 1 deletion(-)
 create mode 100644 modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index a1cf5e06..17b1dc40 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -179,6 +179,7 @@ class Pipelines(object):
     movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
     shop_segmentation = 'shop-segmentation'
     video_inpainting = 'video-inpainting'
+    pst_action_recognition = 'patchshift-action-recognition'
     hand_static = 'hand-static'
 
     # nlp tasks
diff --git a/modelscope/models/cv/action_recognition/__init__.py b/modelscope/models/cv/action_recognition/__init__.py
index 7bdee0cd..5e9dc310 100644
--- a/modelscope/models/cv/action_recognition/__init__.py
+++ b/modelscope/models/cv/action_recognition/__init__.py
@@ -7,11 +7,13 @@ if TYPE_CHECKING:
 
     from .models import BaseVideoModel
     from .tada_convnext import TadaConvNeXt
+    from .temporal_patch_shift_transformer import PatchShiftTransformer
 
 else:
     _import_structure = {
         'models': ['BaseVideoModel'],
         'tada_convnext': ['TadaConvNeXt'],
+        'temporal_patch_shift_transformer': ['PatchShiftTransformer']
     }
 
     import sys
diff --git a/modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py b/modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py
new file mode 100644
index 00000000..46596afd
--- /dev/null
+++ b/modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py
@@ -0,0 +1,1198 @@
+# Part of the implementation is borrowed and modified from Video Swin Transformer,
+# publicly available at https://github.com/SwinTransformer/Video-Swin-Transformer
+
+from abc import ABCMeta, abstractmethod
+from functools import lru_cache, reduce
+from operator import mul
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+import torchvision.transforms as T
+from einops import rearrange
+from timm.models.layers import DropPath, Mlp, trunc_normal_
+
+from modelscope.models import TorchModel
+
+
+def normal_init(module, mean=0., std=1., bias=0.):
+    if hasattr(module, 'weight') and module.weight is not None:
+        nn.init.normal_(module.weight, mean, std)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def window_partition(x, window_size):
+    """ window_partition function.
+    Args:
+        x: (B, D, H, W, C)
+        window_size (tuple[int]): window size
+
+    Returns:
+        windows: (B*num_windows, window_size[0]*window_size[1]*window_size[2], C)
+    """
+    B, D, H, W, C = x.shape
+    x = x.view(B, D // window_size[0], window_size[0], H // window_size[1],
+               window_size[1], W // window_size[2], window_size[2], C)
+    windows = x.permute(0, 1, 3, 5, 2, 4, 6,
+                        7).contiguous().view(-1, reduce(mul, window_size), C)
+    return windows
+
+
+def window_reverse(windows, window_size, B, D, H, W):
+    """ window_reverse function.
+    Args:
+        windows: (B*num_windows, window_size[0], window_size[1], window_size[2], C)
+        window_size (tuple[int]): Window size
+        H (int): Height of image
+        W (int): Width of image
+
+    Returns:
+        x: (B, D, H, W, C)
+    """
+    x = windows.view(B, D // window_size[0], H // window_size[1],
+                     W // window_size[2], window_size[0], window_size[1],
+                     window_size[2], -1)
+    x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(B, D, H, W, -1)
+    return x
+
+
+def get_window_size(x_size, window_size, shift_size=None):
+    use_window_size = list(window_size)
+    if shift_size is not None:
+        use_shift_size = list(shift_size)
+    for i in range(len(x_size)):
+        if x_size[i] <= window_size[i]:
+            use_window_size[i] = x_size[i]
+            if shift_size is not None:
+                use_shift_size[i] = 0
+
+    if shift_size is None:
+        return tuple(use_window_size)
+    else:
+        return tuple(use_window_size), tuple(use_shift_size)
+
+
+class WindowAttention3D(nn.Module):
+    """ This is PyTorch impl of TPS
+
+    Window based multi-head self attention (W-MSA) module with relative position bias.
+    The coordinates of patches and patches are shifted together using Pattern C.
+    It supports both shifted and non-shifted windows.
+
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The temporal length, height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: False
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+        shift (bool, optional): If True, conduct shift operation
+        shift_type (str, optional): shift operation type, either using 'psm' or 'tsm'
+    """
+
+    def __init__(self,
+                 dim,
+                 window_size,
+                 num_heads,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 shift=False,
+                 shift_type='psm'):
+
+        super().__init__()
+        self.dim = dim
+        window_size = (16, 7, 7)
+        self.window_size = window_size  # Wd, Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.shift = shift
+        self.shift_type = shift_type
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros(
+                np.prod([2 * ws - 1 for ws in window_size]),
+                num_heads))  # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH
+
+        # get pair-wise relative position index for each token inside the window
+        coords_d = torch.arange(self.window_size[0])
+        coords_h = torch.arange(self.window_size[1])
+        coords_w = torch.arange(self.window_size[2])
+        coords = torch.stack(
+            torch.meshgrid(coords_d, coords_h, coords_w,
+                           indexing='ij'))  # 3, Wd, Wh, Ww
+        # Do the same rotation to coords
+        coords_old = coords.clone()
+
+        # pattern patternC - 9
+        coords[:, :, 0::3, 0::3] = torch.roll(
+            coords[:, :, 0::3, 0::3], shifts=-4, dims=1)
+        coords[:, :, 0::3, 1::3] = torch.roll(
+            coords[:, :, 0::3, 1::3], shifts=1, dims=1)
+        coords[:, :, 0::3, 2::3] = torch.roll(
+            coords[:, :, 0::3, 2::3], shifts=2, dims=1)
+        coords[:, :, 1::3, 2::3] = torch.roll(
+            coords[:, :, 1::3, 2::3], shifts=3, dims=1)
+        coords[:, :, 1::3, 0::3] = torch.roll(
+            coords[:, :, 1::3, 0::3], shifts=-1, dims=1)
+        coords[:, :, 2::3, 0::3] = torch.roll(
+            coords[:, :, 2::3, 0::3], shifts=-2, dims=1)
+        coords[:, :, 2::3, 1::3] = torch.roll(
+            coords[:, :, 2::3, 1::3], shifts=-3, dims=1)
+        coords[:, :, 2::3, 2::3] = torch.roll(
+            coords[:, :, 2::3, 2::3], shifts=4, dims=1)
+
+        coords_flatten = torch.flatten(coords, 1)  # 3, Wd*Wh*Ww
+        coords_old_flatten = torch.flatten(coords_old, 1)
+        relative_coords = coords_flatten[:, :,
+                                         None] - coords_flatten[:,
+                                                                None, :]  # 3, Wd*Wh*Ww, Wd*Wh*Ww
+        relative_coords_old = coords_old_flatten[:, :,
+                                                 None] - coords_old_flatten[:,
+                                                                            None, :]  # 3, Wd*Wh*Ww, Wd*Wh*Ww
+
+        relative_coords = relative_coords.permute(
+            1, 2, 0).contiguous()  # Wd*Wh*Ww, Wd*Wh*Ww, 3
+        relative_coords_old = relative_coords_old.permute(
+            1, 2, 0).contiguous()  # Wd*Wh*Ww, Wd*Wh*Ww, 3
+
+        relative_coords[:, :,
+                        0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 2] += self.window_size[2] - 1
+
+        relative_coords_old[:, :, 0] += self.window_size[
+            0] - 1  # shift to start from 0
+        relative_coords_old[:, :, 1] += self.window_size[1] - 1
+        relative_coords_old[:, :, 2] += self.window_size[2] - 1
+
+        relative_coords[:, :, 0] *= (2 * self.window_size[1]
+                                     - 1) * (2 * self.window_size[2] - 1)
+        relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1)
+
+        relative_coords_old[:, :, 0] *= (2 * self.window_size[1]
+                                         - 1) * (2 * self.window_size[2] - 1)
+        relative_coords_old[:, :, 1] *= (2 * self.window_size[2] - 1)
+
+        relative_position_index = relative_coords.sum(-1)  # Wd*Wh*Ww, Wd*Wh*Ww
+
+        relative_position_index_old = relative_coords_old.sum(-1)
+        relative_position_index = relative_position_index.view(
+            window_size[0], window_size[1] * window_size[2], window_size[0],
+            window_size[1] * window_size[2]).permute(0, 2, 1, 3).reshape(
+                window_size[0] * window_size[0],
+                window_size[1] * window_size[2],
+                window_size[1] * window_size[2])[::window_size[0], :, :]
+
+        relative_position_index_old = relative_position_index_old.view(
+            window_size[0], window_size[1] * window_size[2], window_size[0],
+            window_size[1] * window_size[2]).permute(0, 2, 1, 3).reshape(
+                window_size[0] * window_size[0],
+                window_size[1] * window_size[2],
+                window_size[1] * window_size[2])[::window_size[0], :, :]
+
+        self.register_buffer('relative_position_index',
+                             relative_position_index)
+        self.register_buffer('relative_position_index_old',
+                             relative_position_index_old)
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+
+        if self.shift and self.shift_type == 'psm':
+            self.shift_op = PatchShift(False, 1)
+            self.shift_op_back = PatchShift(True, 1)
+        elif self.shift and self.shift_type == 'tsm':
+            self.shift_op = TemporalShift(8)
+
+    def forward(self, x, mask=None, batch_size=8, frame_len=8):
+        """ Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, N, N) or None
+        """
+        B_, N, C = x.shape
+        if self.shift:
+            x = x.view(B_, N, self.num_heads,
+                       C // self.num_heads).permute(0, 2, 1, 3)
+
+            x = self.shift_op(x, batch_size, frame_len)
+            x = x.permute(0, 2, 1, 3).reshape(B_, N, C)
+        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
+                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # each: B_, nH, N, C // nH
+
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+
+        if self.shift and self.shift_type == 'psm':
+            relative_position_bias = self.relative_position_bias_table[
+                self.relative_position_index[:].reshape(-1), :].reshape(
+                    frame_len, N, N, -1)  # 8frames ,Wd*Wh*Ww,Wd*Wh*Ww,nH
+        else:
+            relative_position_bias = self.relative_position_bias_table[
+                self.relative_position_index_old[:].reshape(-1), :].reshape(
+                    frame_len, N, N, -1)  # 8frames ,Wd*Wh*Ww,Wd*Wh*Ww,nH
+
+        relative_position_bias = relative_position_bias.permute(
+            0, 3, 1, 2).contiguous()  # Frames, nH, Wd*Wh*Ww, Wd*Wh*Ww
+
+        attn = attn.view(
+            batch_size, frame_len, -1, self.num_heads, N, N).permute(
+                0,
+                2, 1, 3, 4, 5) + relative_position_bias.unsqueeze(0).unsqueeze(
+                    1)  # B_, nH, N, N
+        attn = attn.permute(0, 2, 1, 3, 4, 5).view(-1, self.num_heads, N, N)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N,
+                             N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+        # Shift back for psm
+        if self.shift and self.shift_type == 'psm':
+            x = self.shift_op_back(attn @ v, batch_size,
+                                   frame_len).transpose(1,
+                                                        2).reshape(B_, N, C)
+        else:
+            x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class PatchShift(nn.Module):
+    """ This is PyTorch impl of TPS
+
+    The patches are shifted using Pattern C.
+
+    It supports both the forward shift and the inverse shift (shift back).
+
+    Args:
+        inv (bool): whether using inverse shifted (shift back)
+        ratio (float): ratio of attention heads to be shifted; patch shift uses 1.0
+    """
+
+    def __init__(self, inv=False, ratio=1):
+        super(PatchShift, self).__init__()
+        self.inv = inv
+        self.ratio = ratio
+        # if inv:
+        # print('=> Using inverse PatchShift, ratio {}, tps'.format(ratio))
+        # else:
+        # print('=> Using bayershift, ratio {}, tps'.format(ratio))
+
+    def forward(self, x, batch_size, frame_len):
+        x = self.shift(
+            x,
+            inv=self.inv,
+            ratio=self.ratio,
+            batch_size=batch_size,
+            frame_len=frame_len)
+        return x
+
+    @staticmethod
+    def shift(x, inv=False, ratio=0.5, batch_size=8, frame_len=8):
+        B, num_heads, N, c = x.size()
+        fold = int(num_heads * ratio)
+        feat = x
+        feat = feat.view(batch_size, frame_len, -1, num_heads, 7, 7, c)
+        out = feat.clone()
+        multiplier = 1
+        stride = 1
+        if inv:
+            multiplier = -1
+
+        # Pattern C
+        out[:, :, :, :fold, 0::3, 0::3, :] = torch.roll(
+            feat[:, :, :, :fold, 0::3, 0::3, :],
+            shifts=-4 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 0::3, 1::3, :] = torch.roll(
+            feat[:, :, :, :fold, 0::3, 1::3, :],
+            shifts=multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 1::3, 0::3, :] = torch.roll(
+            feat[:, :, :, :fold, 1::3, 0::3, :],
+            shifts=-multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 0::3, 2::3, :] = torch.roll(
+            feat[:, :, :, :fold, 0::3, 2::3, :],
+            shifts=2 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 2::3, 0::3, :] = torch.roll(
+            feat[:, :, :, :fold, 2::3, 0::3, :],
+            shifts=-2 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 1::3, 2::3, :] = torch.roll(
+            feat[:, :, :, :fold, 1::3, 2::3, :],
+            shifts=3 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 2::3, 1::3, :] = torch.roll(
+            feat[:, :, :, :fold, 2::3, 1::3, :],
+            shifts=-3 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 2::3, 2::3, :] = torch.roll(
+            feat[:, :, :, :fold, 2::3, 2::3, :],
+            shifts=4 * multiplier * stride,
+            dims=1)
+
+        out = out.view(B, num_heads, N, c)
+        return out
+
+
+class TemporalShift(nn.Module):
+    """ This is PyTorch impl of TPS
+
+    The temporal channel shift.
+
+    The code is adapted from TSM: Temporal Shift Module for Efficient Video Understanding. ICCV19
+
+    https://github.com/mit-han-lab/temporal-shift-module/blob/master/ops/temporal_shift.py
+
+    Args:
+        n_div (int): channel divisor; c // n_div channels are shifted in each temporal direction.
+    """
+
+    def __init__(self, n_div=8):
+        super(TemporalShift, self).__init__()
+        self.fold_div = n_div
+
+    def forward(self, x, batch_size, frame_len):
+        x = self.shift(
+            x,
+            fold_div=self.fold_div,
+            batch_size=batch_size,
+            frame_len=frame_len)
+        return x
+
+    @staticmethod
+    def shift(x, fold_div=8, batch_size=8, frame_len=8):
+        B, num_heads, N, c = x.size()
+        fold = c // fold_div
+        feat = x
+        feat = feat.view(batch_size, frame_len, -1, num_heads, N, c)
+        out = feat.clone()
+
+        out[:, 1:, :, :, :, :fold] = feat[:, :-1, :, :, :, :fold]  # shift right
+        out[:, :-1, :, :, :,
+            fold:2 * fold] = feat[:, 1:, :, :, :, fold:2 * fold]  # shift left
+
+        out = out.view(B, num_heads, N, c)
+
+        return out
+
+
+class SwinTransformerBlock3D(nn.Module):
+    """ Swin Transformer Block from Video Swin Transformer.
+
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (tuple[int]): Window size.
+        shift_size (tuple[int]): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 window_size=(2, 7, 7),
+                 shift_size=(0, 0, 0),
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 use_checkpoint=False,
+                 shift=False,
+                 shift_type='psm'):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        self.use_checkpoint = use_checkpoint
+        self.shift = shift
+        self.shift_type = shift_type
+
+        assert 0 <= self.shift_size[0] < self.window_size[
+            0], 'shift_size must in 0-window_size'
+        assert 0 <= self.shift_size[1] < self.window_size[
+            1], 'shift_size must in 0-window_size'
+        assert 0 <= self.shift_size[2] < self.window_size[
+            2], 'shift_size must in 0-window_size'
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention3D(
+            dim,
+            window_size=self.window_size,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            shift=self.shift,
+            shift_type=self.shift_type)
+
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+
+    def forward_part1(self, x, mask_matrix):
+        B, D, H, W, C = x.shape
+        window_size, shift_size = get_window_size((D, H, W), self.window_size,
+                                                  self.shift_size)
+
+        x = self.norm1(x)
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = pad_d0 = 0
+        pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0]
+        pad_b = (window_size[1] - H % window_size[1]) % window_size[1]
+        pad_r = (window_size[2] - W % window_size[2]) % window_size[2]
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1))
+        _, Dp, Hp, Wp, _ = x.shape
+        # cyclic shift
+        if any(i > 0 for i in shift_size):
+            shifted_x = torch.roll(
+                x,
+                shifts=(-shift_size[0], -shift_size[1], -shift_size[2]),
+                dims=(1, 2, 3))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+        # partition windows
+        x_windows = window_partition(shifted_x,
+                                     window_size)  # B*nW, Wd*Wh*Ww, C
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask, batch_size=B,
+            frame_len=D)  # B*nW, Wd*Wh*Ww, C
+        # merge windows
+        attn_windows = attn_windows.view(-1, *(window_size + (C, )))
+        shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp,
+                                   Wp)  # B D' H' W' C
+        # reverse cyclic shift
+        if any(i > 0 for i in shift_size):
+            x = torch.roll(
+                shifted_x,
+                shifts=(shift_size[0], shift_size[1], shift_size[2]),
+                dims=(1, 2, 3))
+        else:
+            x = shifted_x
+
+        if pad_d1 > 0 or pad_r > 0 or pad_b > 0:
+            x = x[:, :D, :H, :W, :].contiguous()
+        return x
+
+    def forward_part2(self, x):
+        return self.drop_path(self.mlp(self.norm2(x)))
+
+    def forward(self, x, mask_matrix):
+        """ Forward function.
+
+        Args:
+            x: Input feature, tensor size (B, D, H, W, C).
+            mask_matrix: Attention mask for cyclic shift.
+        """
+
+        shortcut = x
+        if self.use_checkpoint:
+            x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix)
+        else:
+            x = self.forward_part1(x, mask_matrix)
+        x = shortcut + self.drop_path(x)
+
+        if self.use_checkpoint:
+            x = x + checkpoint.checkpoint(self.forward_part2, x)
+        else:
+            x = x + self.forward_part2(x)
+
+        return x
+
+
+class PatchMerging(nn.Module):
+    """ Patch Merging Layer from Video Swin Transformer.
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+
+    def forward(self, x):
+        """ Forward function.
+
+        Args:
+            x: Input feature, tensor size (B, D, H, W, C).
+        """
+        B, D, H, W, C = x.shape
+
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
+
+        x0 = x[:, :, 0::2, 0::2, :]  # B D H/2 W/2 C
+        x1 = x[:, :, 1::2, 0::2, :]  # B D H/2 W/2 C
+        x2 = x[:, :, 0::2, 1::2, :]  # B D H/2 W/2 C
+        x3 = x[:, :, 1::2, 1::2, :]  # B D H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B D H/2 W/2 4*C
+
+        x = self.norm(x)
+        x = self.reduction(x)
+
+        return x
+
+
+@lru_cache()
+def compute_mask(D, H, W, window_size, shift_size, device):
+    img_mask = torch.zeros((1, D, H, W, 1), device=device)  # 1 Dp Hp Wp 1
+    cnt = 0
+    for d in slice(-window_size[0]), slice(-window_size[0],
+                                           -shift_size[0]), slice(
+                                               -shift_size[0], None):
+        for h in slice(-window_size[1]), slice(-window_size[1],
+                                               -shift_size[1]), slice(
+                                                   -shift_size[1], None):
+            for w in slice(-window_size[2]), slice(-window_size[2],
+                                                   -shift_size[2]), slice(
+                                                       -shift_size[2], None):
+                img_mask[:, d, h, w, :] = cnt
+                cnt += 1
+    mask_windows = window_partition(img_mask,
+                                    window_size)  # nW, ws[0]*ws[1]*ws[2], 1
+    mask_windows = mask_windows.squeeze(-1)  # nW, ws[0]*ws[1]*ws[2]
+    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+    attn_mask = attn_mask.masked_fill(attn_mask != 0,
+                                      float(-100.0)).masked_fill(
+                                          attn_mask == 0, float(0.0))
+    return attn_mask
+
+
+class BasicLayer(nn.Module):
+    """ A basic Swin Transformer layer for one stage from Video Swin Transformer.
+
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (tuple[int]): Local window size. Default: (1,7,7).
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: False
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=(1, 7, 7),
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 use_checkpoint=False,
+                 shift_type='psm'):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = tuple(i // 2 for i in window_size)
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        self.shift_type = shift_type
+
+        # build blocks
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock3D(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i]
+                if isinstance(drop_path, list) else drop_path,
+                norm_layer=norm_layer,
+                use_checkpoint=use_checkpoint,
+                shift=True,
+                shift_type='tsm' if (i % 2 == 0 and self.shift_type == 'psm')
+                or self.shift_type == 'tsm' else 'psm',
+            ) for i in range(depth)
+        ])
+
+        self.downsample = downsample
+        if self.downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+
+    def forward(self, x):
+        """ Forward function.
+
+        Args:
+            x: Input feature, tensor size (B, C, D, H, W).
+        """
+        # calculate attention mask for SW-MSA
+        B, C, D, H, W = x.shape
+        window_size, shift_size = get_window_size((D, H, W), self.window_size,
+                                                  self.shift_size)
+        x = rearrange(x, 'b c d h w -> b d h w c')
+        Dp = int(np.ceil(D / window_size[0])) * window_size[0]
+        Hp = int(np.ceil(H / window_size[1])) * window_size[1]
+        Wp = int(np.ceil(W / window_size[2])) * window_size[2]
+        attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device)
+        for blk in self.blocks:
+            x = blk(x, attn_mask)
+        x = x.view(B, D, H, W, -1)
+
+        if self.downsample is not None:
+            x = self.downsample(x)
+        x = rearrange(x, 'b d h w c -> b c d h w')
+        return x
+
+
+class PatchEmbed3D(nn.Module):
+    """ Video to Patch Embedding from Video Swin Transformer.
+
+    Args:
+        patch_size (tuple[int]): Patch token size. Default: (2,4,4).
+        in_chans (int): Number of input video channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self,
+                 patch_size=(2, 4, 4),
+                 in_chans=3,
+                 embed_dim=96,
+                 norm_layer=None):
+        super().__init__()
+        self.patch_size = patch_size
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        self.proj = nn.Conv3d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        """Forward function."""
+        # padding
+        _, _, D, H, W = x.size()
+        if W % self.patch_size[2] != 0:
+            x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2]))
+        if H % self.patch_size[1] != 0:
+            x = F.pad(x,
+                      (0, 0, 0, self.patch_size[1] - H % self.patch_size[1]))
+        if D % self.patch_size[0] != 0:
+            x = F.pad(
+                x,
+                (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]))
+
+        x = self.proj(x)  # B C D Wh Ww
+        if self.norm is not None:
+            D, Wh, Ww = x.size(2), x.size(3), x.size(4)
+            x = x.flatten(2).transpose(1, 2)
+            x = self.norm(x)
+            x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
+
+        return x
+
+
+class SwinTransformer2D_TPS(nn.Module):
+    """
+        Code is adopted from Video Swin Transformer.
+
+    Args:
+        patch_size (int | tuple(int)): Patch size. Default: (4,4,4).
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (tuple[int]): Window size. Default: (2,7,7).
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer: Normalization layer. Default: nn.LayerNorm.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: False.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+    """
+
+    def __init__(self,
+                 pretrained=None,
+                 pretrained2d=True,
+                 patch_size=(4, 4, 4),
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=(2, 7, 7),
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 patch_norm=False,
+                 frozen_stages=-1,
+                 use_checkpoint=False):
+        super().__init__()
+
+        self.pretrained = pretrained
+        self.pretrained2d = pretrained2d
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.patch_norm = patch_norm
+        self.frozen_stages = frozen_stages
+        self.window_size = window_size
+        self.patch_size = patch_size
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed3D(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging
+                if i_layer < self.num_layers - 1 else None,
+                use_checkpoint=use_checkpoint,
+                shift_type='psm')
+            self.layers.append(layer)
+
+        self.num_features = int(embed_dim * 2**(self.num_layers - 1))
+
+        # add a norm layer for each output
+        self.norm = norm_layer(self.num_features)
+
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+        if self.frozen_stages >= 1:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def inflate_weights(self):
+        """Inflate the swin2d parameters to swin3d.
+
+        The differences between swin3d and swin2d mainly lie in an extra
+        axis. To utilize the pretrained parameters in 2d model,
+        the weight of swin2d models should be inflated to fit in the shapes of
+        the 3d counterpart.
+
+        Note:
+            The checkpoint is read from ``self.pretrained``; the load result
+            is reported with ``print`` (this method takes no logger).
+        """
+        checkpoint = torch.load(self.pretrained, map_location='cpu')
+        state_dict = checkpoint['model']
+
+        # delete relative_position_index since we always re-init it
+        relative_position_index_keys = [
+            k for k in state_dict.keys() if 'relative_position_index' in k
+        ]
+        for k in relative_position_index_keys:
+            del state_dict[k]
+
+        # delete attn_mask since we always re-init it
+        attn_mask_keys = [k for k in state_dict.keys() if 'attn_mask' in k]
+        for k in attn_mask_keys:
+            del state_dict[k]
+
+        state_dict['patch_embed.proj.weight'] = state_dict[
+            'patch_embed.proj.weight'].unsqueeze(2).repeat(
+                1, 1, self.patch_size[0], 1, 1) / self.patch_size[0]
+
+        # bicubic interpolate relative_position_bias_table if not match
+        relative_position_bias_table_keys = [
+            k for k in state_dict.keys() if 'relative_position_bias_table' in k
+        ]
+        for k in relative_position_bias_table_keys:
+            relative_position_bias_table_pretrained = state_dict[k]
+            relative_position_bias_table_current = self.state_dict()[k]
+            L1, nH1 = relative_position_bias_table_pretrained.size()
+            L2, nH2 = relative_position_bias_table_current.size()
+            L2 = (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1)
+            # wd = self.window_size[0]
+            # to make it match
+            wd = 16
+            if nH1 != nH2:
+                print(f'Error in loading {k}, passing')
+            else:
+                if L1 != L2:
+                    S1 = int(L1**0.5)
+                    relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate(
+                        relative_position_bias_table_pretrained.permute(
+                            1, 0).view(1, nH1, S1, S1),
+                        size=(2 * self.window_size[1] - 1,
+                              2 * self.window_size[2] - 1),
+                        mode='bicubic')
+                    relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.view(
+                        nH2, L2).permute(1, 0)
+            state_dict[k] = relative_position_bias_table_pretrained.repeat(
+                2 * wd - 1, 1)
+
+        msg = self.load_state_dict(state_dict, strict=False)
+        print(msg)
+        print(f"=> loaded successfully '{self.pretrained}'")
+        del checkpoint
+        torch.cuda.empty_cache()
+
+    def init_weights(self, pretrained=None):
+        """Initialize the weights in backbone.
+
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None.
+        """
+
+        def _init_weights(m):
+            if isinstance(m, nn.Linear):
+                trunc_normal_(m.weight, std=.02)
+                if isinstance(m, nn.Linear) and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.constant_(m.bias, 0)
+                nn.init.constant_(m.weight, 1.0)
+
+        if pretrained:
+            self.pretrained = pretrained
+        if isinstance(self.pretrained, str):
+            self.apply(_init_weights)
+            print(f'load model from: {self.pretrained}')
+
+            if self.pretrained2d:
+                # Inflate 2D model into 3D model.
+                # self.inflate_weights(logger)
+                self.inflate_weights()
+            else:
+                # Directly load 3D model.
+                torch.load_checkpoint(self, self.pretrained, strict=False)
+        elif self.pretrained is None:
+            self.apply(_init_weights)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        """Forward function."""
+        x = self.patch_embed(x)
+
+        x = self.pos_drop(x)
+
+        for layer in self.layers:
+            x = layer(x.contiguous())
+
+        x = rearrange(x, 'n c d h w -> n d h w c')
+        x = self.norm(x)
+        x = rearrange(x, 'n d h w c -> n c d h w')
+
+        return x
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping frozen layers frozen."""
+        super(SwinTransformer2D_TPS, self).train(mode)
+        self._freeze_stages()
+
+
+def top_k_accuracy(scores, labels, topk=(1, )):
+    """Calculate top k accuracy score from mmaction.
+
+    Args:
+        scores (list[np.ndarray]): Prediction scores for each class.
+        labels (list[int]): Ground truth labels.
+        topk (tuple[int]): K value for top_k_accuracy. Default: (1, ).
+
+    Returns:
+        list[float]: Top k accuracy score for each k.
+    """
+    res = []
+    labels = np.array(labels)[:, np.newaxis]
+    for k in topk:
+        max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1]
+        match_array = np.logical_or.reduce(max_k_preds == labels, axis=1)
+        topk_acc_score = match_array.sum() / match_array.shape[0]
+        res.append(topk_acc_score)
+
+    return res
+
+
+class BaseHead(nn.Module, metaclass=ABCMeta):
+    """Base class for head from mmaction.
+
+    All Head should subclass it.
+    All subclass should overwrite:
+    - Methods:``init_weights``, initializing weights in some modules.
+    - Methods:``forward``, supporting to forward both for training and testing.
+
+    Args:
+        num_classes (int): Number of classes to be classified.
+        in_channels (int): Number of channels in input feature.
+        loss_cls (dict): Config for building loss.
+            Default: dict(type='CrossEntropyLoss', loss_weight=1.0).
+        multi_class (bool): Determines whether it is a multi-class
+            recognition task. Default: False.
+        label_smooth_eps (float): Epsilon used in label smooth.
+            Reference: arxiv.org/abs/1906.02629. Default: 0.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 loss_cls=dict(type='CrossEntropyLoss', loss_weight=1.0),
+                 multi_class=False,
+                 label_smooth_eps=0.0):
+        super().__init__()
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.loss_cls = torch.nn.CrossEntropyLoss()
+        self.multi_class = multi_class
+        self.label_smooth_eps = label_smooth_eps
+
+    @abstractmethod
+    def init_weights(self):
+        """Initiate the parameters either from existing checkpoint or from
+        scratch."""
+
+    @abstractmethod
+    def forward(self, x):
+        """Defines the computation performed at every call."""
+
+    def loss(self, cls_score, labels, **kwargs):
+        """Calculate the loss given output ``cls_score``, target ``labels``.
+
+        Args:
+            cls_score (torch.Tensor): The output of the model.
+            labels (torch.Tensor): The target output of the model.
+
+        Returns:
+            dict: A dict containing field 'loss_cls'(mandatory)
+            and 'top1_acc', 'top5_acc'(optional).
+        """
+        losses = dict()
+        if labels.shape == torch.Size([]):
+            labels = labels.unsqueeze(0)
+        elif labels.dim() == 1 and labels.size()[0] == self.num_classes \
+                and cls_score.size()[0] == 1:
+            # Fix a bug when training with soft labels and batch size is 1.
+            # When using soft labels, `labels` and `cls_score` share the same
+            # shape.
+            labels = labels.unsqueeze(0)
+
+        if not self.multi_class and cls_score.size() != labels.size():
+            top_k_acc = top_k_accuracy(cls_score.detach().cpu().numpy(),
+                                       labels.detach().cpu().numpy(), (1, 5))
+            losses['top1_acc'] = torch.tensor(
+                top_k_acc[0], device=cls_score.device)
+            losses['top5_acc'] = torch.tensor(
+                top_k_acc[1], device=cls_score.device)
+
+        elif self.multi_class and self.label_smooth_eps != 0:
+            labels = ((1 - self.label_smooth_eps) * labels
+                      + self.label_smooth_eps / self.num_classes)
+
+        loss_cls = self.loss_cls(cls_score, labels, **kwargs)
+        # loss_cls may be dictionary or single tensor
+        if isinstance(loss_cls, dict):
+            losses.update(loss_cls)
+        else:
+            losses['loss_cls'] = loss_cls
+
+        return losses
+
+
+class I3DHead(BaseHead):
+    """Classification head for I3D from mmaction.
+
+    Args:
+        num_classes (int): Number of classes to be classified.
+        in_channels (int): Number of channels in input feature.
+        loss_cls (dict): Config for building loss.
+            Default: dict(type='CrossEntropyLoss')
+        spatial_type (str): Pooling type in spatial dimension. Default: 'avg'.
+        dropout_ratio (float): Probability of dropout layer. Default: 0.5.
+        init_std (float): Std value for initialization. Default: 0.01.
+        kwargs (dict, optional): Any keyword argument to be used to initialize
+            the head.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 loss_cls=dict(type='CrossEntropyLoss'),
+                 spatial_type='avg',
+                 dropout_ratio=0.5,
+                 init_std=0.01,
+                 **kwargs):
+        super().__init__(num_classes, in_channels, loss_cls, **kwargs)
+
+        self.spatial_type = spatial_type
+        self.dropout_ratio = dropout_ratio
+        self.init_std = init_std
+        if self.dropout_ratio != 0:
+            self.dropout = nn.Dropout(p=self.dropout_ratio)
+        else:
+            self.dropout = None
+        self.fc_cls = nn.Linear(self.in_channels, self.num_classes)
+
+        if self.spatial_type == 'avg':
+            # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels.
+            self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
+        else:
+            self.avg_pool = None
+
+    def init_weights(self):
+        """Initiate the parameters from scratch."""
+        normal_init(self.fc_cls, std=self.init_std)
+
+    def forward(self, x):
+        """Defines the computation performed at every call.
+
+        Args:
+            x (torch.Tensor): The input data.
+
+        Returns:
+            torch.Tensor: The classification scores for input samples.
+        """
+        # [N, in_channels, 4, 7, 7]
+        if self.avg_pool is not None:
+            x = self.avg_pool(x)
+        # [N, in_channels, 1, 1, 1]
+        if self.dropout is not None:
+            x = self.dropout(x)
+        # [N, in_channels, 1, 1, 1]
+        x = x.view(x.shape[0], -1)
+        # [N, in_channels]
+        cls_score = self.fc_cls(x)
+        # [N, num_classes]
+        return cls_score
+
+
+class PatchShiftTransformer(TorchModel):
+    """  This is PyTorch impl of PST:
+    Spatiotemporal Self-attention Modeling with Temporal Patch Shift for Action Recognition, ECCV22.
+    """
+
+    def __init__(self,
+                 model_dir=None,
+                 num_classes=400,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 embed_dim=96,
+                 in_channels=768,
+                 pretrained=None):
+        super().__init__(model_dir)
+        self.backbone = SwinTransformer2D_TPS(
+            pretrained=pretrained,
+            pretrained2d=True,
+            patch_size=(2, 4, 4),
+            in_chans=3,
+            embed_dim=embed_dim,
+            depths=depths,
+            num_heads=num_heads,
+            window_size=(1, 7, 7),
+            mlp_ratio=4.,
+            qkv_bias=True,
+            qk_scale=None,
+            drop_rate=0.,
+            attn_drop_rate=0.,
+            drop_path_rate=0.2,
+            norm_layer=nn.LayerNorm,
+            patch_norm=True,
+            frozen_stages=-1,
+            use_checkpoint=False)
+        self.cls_head = I3DHead(
+            num_classes=num_classes, in_channels=in_channels)
+
+    def forward(self, x):
+        feature = self.backbone(x)
+        output = self.cls_head(feature)
+        return output
diff --git a/modelscope/pipelines/cv/action_recognition_pipeline.py b/modelscope/pipelines/cv/action_recognition_pipeline.py
index 7f1a46b2..993a32f0 100644
--- a/modelscope/pipelines/cv/action_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/action_recognition_pipeline.py
@@ -7,7 +7,8 @@ from typing import Any, Dict
 import torch
 
 from modelscope.metainfo import Pipelines
-from modelscope.models.cv.action_recognition import BaseVideoModel
+from modelscope.models.cv.action_recognition import (BaseVideoModel,
+                                                     PatchShiftTransformer)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -69,3 +70,54 @@ class ActionRecognitionPipeline(Pipeline):
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         return inputs
+
+
+@PIPELINES.register_module(
+    Tasks.action_recognition, module_name=Pipelines.pst_action_recognition)
+class PSTActionRecognitionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a PST action recognition pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {model_path}')
+        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
+        logger.info(f'loading config from {config_path}')
+        self.cfg = Config.from_file(config_path)
+        self.infer_model = PatchShiftTransformer(model).to(self.device)
+        self.infer_model.eval()
+        self.infer_model.load_state_dict(
+            torch.load(model_path, map_location=self.device)['state_dict'])
+        self.label_mapping = self.cfg.label_mapping
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            video_input_data = ReadVideoData(self.cfg, input).to(self.device)
+        else:
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+        result = {'video_data': video_input_data}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        pred = self.perform_inference(input['video_data'])
+        output_label = self.label_mapping[str(pred)]
+        return {OutputKeys.LABELS: output_label}
+
+    @torch.no_grad()
+    def perform_inference(self, data, max_bsz=4):
+        iter_num = math.ceil(data.size(0) / max_bsz)
+        preds_list = []
+        for i in range(iter_num):
+            preds_list.append(
+                self.infer_model(data[i * max_bsz:(i + 1) * max_bsz]))
+        pred = torch.cat(preds_list, dim=0)
+        return pred.mean(dim=0).argmax().item()
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/tests/pipelines/test_action_recognition.py b/tests/pipelines/test_action_recognition.py
index b9548630..292eb238 100644
--- a/tests/pipelines/test_action_recognition.py
+++ b/tests/pipelines/test_action_recognition.py
@@ -29,6 +29,14 @@ class ActionRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
 
         print(f'recognition output: {result}.')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pst(self):
+        pst_recognition_pipeline = pipeline(
+            self.task, model='damo/cv_pathshift_action-recognition')
+        result = pst_recognition_pipeline(
+            'data/test/videos/action_recognition_test_video.mp4')
+        print('pst recognition results:', result)
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_demo_compatibility(self):
         self.compatibility_check()

From c31914652a7b07c9ffc04e0b2177afaac6f27eb4 Mon Sep 17 00:00:00 2001
From: "chaojie.mcj" <chaojie.mcj@alibaba-inc.com>
Date: Sat, 1 Oct 2022 11:17:02 +0800
Subject: [PATCH 165/175] [to #42322933]fix some offline model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

针对两个已经下线的模型补充license
modelscope/models/cv/image_to_image_generation
modelscope/models/cv/image_to_image_translation
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10304412
---
 modelscope/models/cv/image_to_image_generation/model.py        | 1 +
 .../models/cv/image_to_image_generation/models/autoencoder.py  | 1 +
 modelscope/models/cv/image_to_image_generation/models/clip.py  | 2 ++
 .../models/cv/image_to_image_generation/ops/diffusion.py       | 1 +
 modelscope/models/cv/image_to_image_generation/ops/losses.py   | 1 +
 .../models/cv/image_to_image_translation/data/transforms.py    | 1 +
 .../models/cv/image_to_image_translation/model_translation.py  | 1 +
 .../models/cv/image_to_image_translation/models/autoencoder.py | 1 +
 modelscope/models/cv/image_to_image_translation/models/clip.py | 2 ++
 modelscope/models/cv/image_to_image_translation/ops/apps.py    | 1 +
 .../models/cv/image_to_image_translation/ops/degradation.py    | 1 +
 .../models/cv/image_to_image_translation/ops/diffusion.py      | 3 +++
 modelscope/models/cv/image_to_image_translation/ops/losses.py  | 1 +
 modelscope/models/cv/image_to_image_translation/ops/metrics.py | 1 +
 .../models/cv/image_to_image_translation/ops/random_color.py   | 1 +
 .../models/cv/image_to_image_translation/ops/random_mask.py    | 1 +
 modelscope/models/cv/image_to_image_translation/ops/svd.py     | 1 +
 modelscope/models/cv/image_to_image_translation/ops/utils.py   | 1 +
 modelscope/pipelines/cv/video_inpainting_pipeline.py           | 1 +
 19 files changed, 23 insertions(+)

diff --git a/modelscope/models/cv/image_to_image_generation/model.py b/modelscope/models/cv/image_to_image_generation/model.py
index 37479b43..94e5dd7b 100644
--- a/modelscope/models/cv/image_to_image_generation/model.py
+++ b/modelscope/models/cv/image_to_image_generation/model.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_generation/models/autoencoder.py b/modelscope/models/cv/image_to_image_generation/models/autoencoder.py
index 181472de..dce256f6 100644
--- a/modelscope/models/cv/image_to_image_generation/models/autoencoder.py
+++ b/modelscope/models/cv/image_to_image_generation/models/autoencoder.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_generation/models/clip.py b/modelscope/models/cv/image_to_image_generation/models/clip.py
index 35d9d882..d3dd22b4 100644
--- a/modelscope/models/cv/image_to_image_generation/models/clip.py
+++ b/modelscope/models/cv/image_to_image_generation/models/clip.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_generation/ops/diffusion.py b/modelscope/models/cv/image_to_image_generation/ops/diffusion.py
index bcbb6402..b8ffbbbb 100644
--- a/modelscope/models/cv/image_to_image_generation/ops/diffusion.py
+++ b/modelscope/models/cv/image_to_image_generation/ops/diffusion.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_generation/ops/losses.py b/modelscope/models/cv/image_to_image_generation/ops/losses.py
index 23e8d246..46b9540a 100644
--- a/modelscope/models/cv/image_to_image_generation/ops/losses.py
+++ b/modelscope/models/cv/image_to_image_generation/ops/losses.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/data/transforms.py b/modelscope/models/cv/image_to_image_translation/data/transforms.py
index 5376d813..29a25b4b 100644
--- a/modelscope/models/cv/image_to_image_translation/data/transforms.py
+++ b/modelscope/models/cv/image_to_image_translation/data/transforms.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 import random
 
diff --git a/modelscope/models/cv/image_to_image_translation/model_translation.py b/modelscope/models/cv/image_to_image_translation/model_translation.py
index 722b175d..f2a9e7db 100644
--- a/modelscope/models/cv/image_to_image_translation/model_translation.py
+++ b/modelscope/models/cv/image_to_image_translation/model_translation.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/models/autoencoder.py b/modelscope/models/cv/image_to_image_translation/models/autoencoder.py
index 181472de..dce256f6 100644
--- a/modelscope/models/cv/image_to_image_translation/models/autoencoder.py
+++ b/modelscope/models/cv/image_to_image_translation/models/autoencoder.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/models/clip.py b/modelscope/models/cv/image_to_image_translation/models/clip.py
index 35d9d882..d3dd22b4 100644
--- a/modelscope/models/cv/image_to_image_translation/models/clip.py
+++ b/modelscope/models/cv/image_to_image_translation/models/clip.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/ops/apps.py b/modelscope/models/cv/image_to_image_translation/ops/apps.py
index ee4be489..39d2e015 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/apps.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/apps.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 # APPs that facilitate the use of pretrained neural networks.
 
 import os.path as osp
diff --git a/modelscope/models/cv/image_to_image_translation/ops/degradation.py b/modelscope/models/cv/image_to_image_translation/ops/degradation.py
index c3b3d1df..9061e7be 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/degradation.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/degradation.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 import os
 import random
diff --git a/modelscope/models/cv/image_to_image_translation/ops/diffusion.py b/modelscope/models/cv/image_to_image_translation/ops/diffusion.py
index bcbb6402..5ff37dc3 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/diffusion.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/diffusion.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/ops/losses.py b/modelscope/models/cv/image_to_image_translation/ops/losses.py
index 23e8d246..46b9540a 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/losses.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/losses.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/ops/metrics.py b/modelscope/models/cv/image_to_image_translation/ops/metrics.py
index 4a63c51f..c1023fa0 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/metrics.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/metrics.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import numpy as np
 import scipy.linalg as linalg
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/ops/random_color.py b/modelscope/models/cv/image_to_image_translation/ops/random_color.py
index 97e2f848..75692836 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/random_color.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/random_color.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import colorsys
 import random
 
diff --git a/modelscope/models/cv/image_to_image_translation/ops/random_mask.py b/modelscope/models/cv/image_to_image_translation/ops/random_mask.py
index a6b55916..bda1ec11 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/random_mask.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/random_mask.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import cv2
 import numpy as np
 
diff --git a/modelscope/models/cv/image_to_image_translation/ops/svd.py b/modelscope/models/cv/image_to_image_translation/ops/svd.py
index c5173de1..96f7e825 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/svd.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/svd.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 r"""SVD of linear degradation matrices described in the paper
     ``Denoising Diffusion Restoration Models.''
     @article{kawar2022denoising,
diff --git a/modelscope/models/cv/image_to_image_translation/ops/utils.py b/modelscope/models/cv/image_to_image_translation/ops/utils.py
index 3e523f4c..c2aacedc 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/utils.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/utils.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import base64
 import binascii
 import hashlib
diff --git a/modelscope/pipelines/cv/video_inpainting_pipeline.py b/modelscope/pipelines/cv/video_inpainting_pipeline.py
index 15444e05..85133474 100644
--- a/modelscope/pipelines/cv/video_inpainting_pipeline.py
+++ b/modelscope/pipelines/cv/video_inpainting_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines

From d1016204454402d4b3c8931f0575ace874d1bce4 Mon Sep 17 00:00:00 2001
From: "jianqiang.rjq" <jianqiang.rjq@alibaba-inc.com>
Date: Sat, 1 Oct 2022 11:17:32 +0800
Subject: [PATCH 166/175] add header         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10305074

    * add header
---
 modelscope/pipelines/cv/image_style_transfer_pipeline.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modelscope/pipelines/cv/image_style_transfer_pipeline.py b/modelscope/pipelines/cv/image_style_transfer_pipeline.py
index 64e67115..e5fd0d48 100644
--- a/modelscope/pipelines/cv/image_style_transfer_pipeline.py
+++ b/modelscope/pipelines/cv/image_style_transfer_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Any, Dict
 

From 00a078c9b524fe8da785ae789fb7c6b5990f6866 Mon Sep 17 00:00:00 2001
From: "lingchen.zlm" <lingchen.zlm@alibaba-inc.com>
Date: Sat, 1 Oct 2022 11:18:22 +0800
Subject: [PATCH 167/175] [to #42322933]fix gemm demo error when text input is
 empty         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10290715

---
 modelscope/models/multi_modal/gemm/gemm_model.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/modelscope/models/multi_modal/gemm/gemm_model.py b/modelscope/models/multi_modal/gemm/gemm_model.py
index 55b211c0..c90b35d4 100644
--- a/modelscope/models/multi_modal/gemm/gemm_model.py
+++ b/modelscope/models/multi_modal/gemm/gemm_model.py
@@ -67,7 +67,7 @@ class GEMMForMultiModalEmbedding(TorchModel):
         return img_tensor
 
     def parse_text(self, text_str):
-        if text_str is None:
+        if text_str is None or len(text_str) == 0:
             return None
         if isinstance(text_str, str):
             text_ids_tensor = self.gemm_model.tokenize(text_str)
@@ -79,9 +79,12 @@ class GEMMForMultiModalEmbedding(TorchModel):
         return text_ids_tensor.view(1, -1)
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        image = self.parse_image(input.get('image', input.get('img', None)))
-        text = self.parse_text(input.get('text', input.get('txt', None)))
-        captioning = input.get('captioning', False) is True
+        image_input = input.get('image', input.get('img', None))
+        text_input = input.get('text', input.get('txt', None))
+        captioning_input = input.get('captioning', None)
+        image = self.parse_image(image_input)
+        text = self.parse_text(text_input)
+        captioning = captioning_input is True or text_input == ''
         out = self.gemm_model(image, text, captioning)
         output = {
             OutputKeys.IMG_EMBEDDING: out.get('image_feature', None),

From 29160fa9da92cc17e0bed683fe359bac70b7e602 Mon Sep 17 00:00:00 2001
From: "shouzhou.bx" <shouzhou.bx@alibaba-inc.com>
Date: Sat, 1 Oct 2022 11:38:29 +0800
Subject: [PATCH 168/175] [to #42322933]update copyright header for
 body-2d-keypoints         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10311587

---
 modelscope/models/cv/body_2d_keypoints/hrnet_v2.py    | 2 ++
 modelscope/models/cv/body_2d_keypoints/w48.py         | 2 ++
 modelscope/pipelines/cv/body_2d_keypoints_pipeline.py | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/modelscope/models/cv/body_2d_keypoints/hrnet_v2.py b/modelscope/models/cv/body_2d_keypoints/hrnet_v2.py
index 1570c8cc..ebd69adb 100644
--- a/modelscope/models/cv/body_2d_keypoints/hrnet_v2.py
+++ b/modelscope/models/cv/body_2d_keypoints/hrnet_v2.py
@@ -1,3 +1,5 @@
+# The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation.
+
 import os
 
 import numpy as np
diff --git a/modelscope/models/cv/body_2d_keypoints/w48.py b/modelscope/models/cv/body_2d_keypoints/w48.py
index 7140f8fe..e0317991 100644
--- a/modelscope/models/cv/body_2d_keypoints/w48.py
+++ b/modelscope/models/cv/body_2d_keypoints/w48.py
@@ -1,3 +1,5 @@
+# The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation.
+
 cfg_128x128_15 = {
     'DATASET': {
         'TYPE': 'DAMO',
diff --git a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py
index c6a05195..d6afbae4 100644
--- a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py
+++ b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Any, Dict, List, Union
 

From 6752d59fde36e80bde0ec82c3fae744f03957463 Mon Sep 17 00:00:00 2001
From: myf272609 <myf272609@alibaba-inc.com>
Date: Sat, 1 Oct 2022 11:44:17 +0800
Subject: [PATCH 169/175] [to #42322933] add license headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 增加卡通化相关license headers
2. 检测算法同学委托@岩一，增加相关license headers
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10315091
---
 modelscope/models/cv/cartoon/facelib/LK/lk.py               | 2 ++
 modelscope/models/cv/cartoon/facelib/config.py              | 2 ++
 modelscope/models/cv/cartoon/facelib/face_detector.py       | 2 ++
 modelscope/models/cv/cartoon/facelib/face_landmark.py       | 2 ++
 modelscope/models/cv/cartoon/facelib/facer.py               | 2 ++
 .../models/cv/cartoon/mtcnn_pytorch/src/align_trans.py      | 6 ++----
 .../models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py  | 6 +-----
 modelscope/models/cv/cartoon/utils.py                       | 2 ++
 modelscope/pipelines/cv/image_cartoon_pipeline.py           | 2 ++
 modelscope/pipelines/cv/image_detection_pipeline.py         | 2 ++
 modelscope/pipelines/cv/image_salient_detection_pipeline.py | 2 ++
 11 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/modelscope/models/cv/cartoon/facelib/LK/lk.py b/modelscope/models/cv/cartoon/facelib/LK/lk.py
index df05e3f9..6fd95ad6 100644
--- a/modelscope/models/cv/cartoon/facelib/LK/lk.py
+++ b/modelscope/models/cv/cartoon/facelib/LK/lk.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import numpy as np
 
 from modelscope.models.cv.cartoon.facelib.config import config as cfg
diff --git a/modelscope/models/cv/cartoon/facelib/config.py b/modelscope/models/cv/cartoon/facelib/config.py
index d795fdde..92b39db0 100644
--- a/modelscope/models/cv/cartoon/facelib/config.py
+++ b/modelscope/models/cv/cartoon/facelib/config.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import os
 
 import numpy as np
diff --git a/modelscope/models/cv/cartoon/facelib/face_detector.py b/modelscope/models/cv/cartoon/facelib/face_detector.py
index e5589719..fa36d662 100644
--- a/modelscope/models/cv/cartoon/facelib/face_detector.py
+++ b/modelscope/models/cv/cartoon/facelib/face_detector.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import time
 
 import cv2
diff --git a/modelscope/models/cv/cartoon/facelib/face_landmark.py b/modelscope/models/cv/cartoon/facelib/face_landmark.py
index 063d40c3..3b7cc1b9 100644
--- a/modelscope/models/cv/cartoon/facelib/face_landmark.py
+++ b/modelscope/models/cv/cartoon/facelib/face_landmark.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import cv2
 import numpy as np
 import tensorflow as tf
diff --git a/modelscope/models/cv/cartoon/facelib/facer.py b/modelscope/models/cv/cartoon/facelib/facer.py
index 62388ab9..c6f34e9c 100644
--- a/modelscope/models/cv/cartoon/facelib/facer.py
+++ b/modelscope/models/cv/cartoon/facelib/facer.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import time
 
 import cv2
diff --git a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py
index baa3ba73..eb542042 100644
--- a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py
+++ b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py
@@ -1,7 +1,5 @@
-"""
-Created on Mon Apr 24 15:43:29 2017
-@author: zhaoy
-"""
+# The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch
+
 import cv2
 import numpy as np
 
diff --git a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py
index 96a5f965..ea9fbacf 100644
--- a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py
+++ b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py
@@ -1,8 +1,4 @@
-"""
-Created on Tue Jul 11 06:54:28 2017
-
-@author: zhaoyafei
-"""
+# The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch
 
 import numpy as np
 from numpy.linalg import inv, lstsq
diff --git a/modelscope/models/cv/cartoon/utils.py b/modelscope/models/cv/cartoon/utils.py
index 39712653..59b4e879 100644
--- a/modelscope/models/cv/cartoon/utils.py
+++ b/modelscope/models/cv/cartoon/utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 
 import cv2
diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py
index 787aa06d..8606915c 100644
--- a/modelscope/pipelines/cv/image_cartoon_pipeline.py
+++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/image_detection_pipeline.py b/modelscope/pipelines/cv/image_detection_pipeline.py
index 8df10d45..f5554ca2 100644
--- a/modelscope/pipelines/cv/image_detection_pipeline.py
+++ b/modelscope/pipelines/cv/image_detection_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 import numpy as np
diff --git a/modelscope/pipelines/cv/image_salient_detection_pipeline.py b/modelscope/pipelines/cv/image_salient_detection_pipeline.py
index 3b145cf0..4a3eaa65 100644
--- a/modelscope/pipelines/cv/image_salient_detection_pipeline.py
+++ b/modelscope/pipelines/cv/image_salient_detection_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines

From 4199af337e68b398ca6cc9b3107ff685bcb67b79 Mon Sep 17 00:00:00 2001
From: "tingwei.gtw" <tingwei.gtw@alibaba-inc.com>
Date: Sat, 1 Oct 2022 15:57:12 +0800
Subject: [PATCH 170/175] [to #42322933] add face-human-hand detection model   
      Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10260332

---
 .../test/images/face_human_hand_detection.jpg |   3 +
 modelscope/metainfo.py                        |   2 +
 .../cv/face_human_hand_detection/__init__.py  |  20 +
 .../cv/face_human_hand_detection/det_infer.py | 133 ++++++
 .../cv/face_human_hand_detection/ghost_pan.py | 395 ++++++++++++++++
 .../nanodet_plus_head.py                      | 427 ++++++++++++++++++
 .../one_stage_detector.py                     |  64 +++
 .../face_human_hand_detection/shufflenetv2.py | 182 ++++++++
 .../cv/face_human_hand_detection/utils.py     | 277 ++++++++++++
 modelscope/outputs.py                         |  11 +-
 modelscope/pipelines/builder.py               |   3 +
 .../cv/face_human_hand_detection_pipeline.py  |  42 ++
 modelscope/utils/constant.py                  |   1 +
 .../test_face_human_hand_detection.py         |  38 ++
 14 files changed, 1597 insertions(+), 1 deletion(-)
 create mode 100644 data/test/images/face_human_hand_detection.jpg
 create mode 100644 modelscope/models/cv/face_human_hand_detection/__init__.py
 create mode 100644 modelscope/models/cv/face_human_hand_detection/det_infer.py
 create mode 100644 modelscope/models/cv/face_human_hand_detection/ghost_pan.py
 create mode 100644 modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py
 create mode 100644 modelscope/models/cv/face_human_hand_detection/one_stage_detector.py
 create mode 100644 modelscope/models/cv/face_human_hand_detection/shufflenetv2.py
 create mode 100644 modelscope/models/cv/face_human_hand_detection/utils.py
 create mode 100644 modelscope/pipelines/cv/face_human_hand_detection_pipeline.py
 create mode 100644 tests/pipelines/test_face_human_hand_detection.py

diff --git a/data/test/images/face_human_hand_detection.jpg b/data/test/images/face_human_hand_detection.jpg
new file mode 100644
index 00000000..f94bb547
--- /dev/null
+++ b/data/test/images/face_human_hand_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fddc7be8381eb244cd692601f1c1e6cf3484b44bb4e73df0bc7de29352eb487
+size 23889
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 17b1dc40..54e09f7a 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -40,6 +40,7 @@ class Models(object):
     ulfd = 'ulfd'
     video_inpainting = 'video-inpainting'
     hand_static = 'hand-static'
+    face_human_hand_detection = 'face-human-hand-detection'
 
     # EasyCV models
     yolox = 'YOLOX'
@@ -181,6 +182,7 @@ class Pipelines(object):
     video_inpainting = 'video-inpainting'
     pst_action_recognition = 'patchshift-action-recognition'
     hand_static = 'hand-static'
+    face_human_hand_detection = 'face-human-hand-detection'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
diff --git a/modelscope/models/cv/face_human_hand_detection/__init__.py b/modelscope/models/cv/face_human_hand_detection/__init__.py
new file mode 100644
index 00000000..33a5fd2f
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .det_infer import NanoDetForFaceHumanHandDetection
+
+else:
+    _import_structure = {'det_infer': ['NanoDetForFaceHumanHandDetection']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/face_human_hand_detection/det_infer.py b/modelscope/models/cv/face_human_hand_detection/det_infer.py
new file mode 100644
index 00000000..7a7225ee
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/det_infer.py
@@ -0,0 +1,133 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .one_stage_detector import OneStageDetector
+
+logger = get_logger()
+
+
+def load_model_weight(model_dir, device):
+    checkpoint = torch.load(
+        '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+        map_location=device)
+    state_dict = checkpoint['state_dict'].copy()  # copy, so the loop can edit
+    for k in checkpoint['state_dict']:
+        if k.startswith('avg_model.'):
+            v = state_dict.pop(k)
+            state_dict[k[4:]] = v  # drops 'avg_': 'avg_model.x' -> 'model.x'
+    # Re-keyed 'avg_model.*' values overwrite any existing 'model.*' entry.
+    return state_dict
+
+
+@MODELS.register_module(
+    Tasks.face_human_hand_detection,
+    module_name=Models.face_human_hand_detection)
+class NanoDetForFaceHumanHandDetection(TorchModel):
+    # Wraps a OneStageDetector and loads its weights from model_dir.
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        # Build the detector, load checkpoint weights, switch to eval mode.
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+        # self.device is picked from CUDA availability, not from device_id.
+        self.model = OneStageDetector()
+        if torch.cuda.is_available():
+            self.device = 'cuda'
+            logger.info('Use GPU ')
+        else:
+            self.device = 'cpu'
+            logger.info('Use CPU')
+        # NOTE(review): may shadow nn.Module.state_dict(); confirm base class.
+        self.state_dict = load_model_weight(model_dir, self.device)
+        self.model.load_state_dict(self.state_dict, strict=False)
+        self.model.eval()
+        self.model.to(self.device)
+
+    def forward(self, x):
+        pred_result = self.model.inference(x)
+        return pred_result
+
+
+def naive_collate(batch):  # collate a list of samples without stacking
+    elem = batch[0]  # the first sample decides how the batch is handled
+    if isinstance(elem, dict):
+        return {key: naive_collate([d[key] for d in batch]) for key in elem}
+    else:
+        return batch  # non-dict samples: the list is returned unchanged
+
+
+def get_resize_matrix(raw_shape, dst_shape):
+    # Return a 3x3 matrix scaling (w, h) from raw_shape to dst_shape.
+    r_w, r_h = raw_shape
+    d_w, d_h = dst_shape
+    Rs = np.eye(3)
+    # Independent x/y factors, so the aspect ratio need not be preserved.
+    Rs[0, 0] *= d_w / r_w
+    Rs[1, 1] *= d_h / r_h
+    return Rs
+
+
+def color_aug_and_norm(meta, mean, std):  # normalize meta['img']
+    img = meta['img'].astype(np.float32) / 255
+    mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3) / 255
+    std = np.array(std, dtype=np.float32).reshape(1, 1, 3) / 255
+    img = (img - mean) / std  # mean/std were divided by 255 like the image
+    meta['img'] = img  # the input dict is updated and also returned
+    return meta
+
+
+def img_process(meta, mean, std):  # warp meta['img'] to 320x320, normalize
+    raw_img = meta['img']
+    height = raw_img.shape[0]
+    width = raw_img.shape[1]
+    dst_shape = [320, 320]  # fixed output size, passed as dsize below
+    M = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # identity
+    ResizeM = get_resize_matrix((width, height), dst_shape)
+    M = ResizeM @ M  # with M the identity, this is just the resize matrix
+    img = cv2.warpPerspective(raw_img, M, dsize=tuple(dst_shape))
+    meta['img'] = img
+    meta['warp_matrix'] = M  # kept in meta next to the warped image
+    meta = color_aug_and_norm(meta, mean, std)
+    return meta
+
+
+def overlay_bbox_cv(dets, class_names, score_thresh):  # class_names unused
+    all_box = []
+    for label in dets:
+        for bbox in dets[label]:
+            score = bbox[-1]  # the last element of a bbox is its score
+            if score > score_thresh:  # strict: score == thresh is dropped
+                x0, y0, x1, y1 = [int(i) for i in bbox[:4]]
+                all_box.append([label, x0, y0, x1, y1, score])
+    all_box.sort(key=lambda v: v[5])  # ascending by score
+    return all_box
+
+
+mean = [103.53, 116.28, 123.675]
+std = [57.375, 57.12, 58.395]
+class_names = ['person', 'face', 'hand']
+
+
+def inference(model, device, img_path):
+    img_info = {'id': 0}
+    img = cv2.imread(img_path)  # None if unreadable; not checked here
+    height, width = img.shape[:2]
+    img_info['height'] = height
+    img_info['width'] = width
+    meta = dict(img_info=img_info, raw_img=img, img=img)
+    # Resize/normalize, reorder axes (2, 0, 1), move to device, batch of 1.
+    meta = img_process(meta, mean, std)
+    meta['img'] = torch.from_numpy(meta['img'].transpose(2, 0, 1)).to(device)
+    meta = naive_collate([meta])
+    meta['img'] = (meta['img'][0]).reshape(1, 3, 320, 320)
+    with torch.no_grad():
+        res = model(meta)
+    result = overlay_bbox_cv(res[0], class_names, score_thresh=0.35)
+    return result  # [label, x0, y0, x1, y1, score] lists, score > 0.35
diff --git a/modelscope/models/cv/face_human_hand_detection/ghost_pan.py b/modelscope/models/cv/face_human_hand_detection/ghost_pan.py
new file mode 100644
index 00000000..e00de407
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/ghost_pan.py
@@ -0,0 +1,395 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import math
+
+import torch
+import torch.nn as nn
+
+from .utils import ConvModule, DepthwiseConvModule, act_layers
+
+
+def _make_divisible(v, divisor, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+def hard_sigmoid(x, inplace: bool = False):
+    """Return relu6(x + 3) / 6; `x` itself is modified when `inplace`."""
+    if inplace:
+        return x.add_(3.0).clamp_(0.0, 6.0).div_(6.0)
+    return nn.functional.relu6(x + 3.0) / 6.0  # `F` is not imported here
+
+
+class SqueezeExcite(nn.Module):
+
+    def __init__(self,
+                 in_chs,
+                 se_ratio=0.25,
+                 reduced_base_chs=None,
+                 activation='ReLU',
+                 gate_fn=hard_sigmoid,
+                 divisor=4,
+                 **_):
+        super(SqueezeExcite, self).__init__()
+        self.gate_fn = gate_fn
+        reduced_chs = _make_divisible((reduced_base_chs or in_chs) * se_ratio,
+                                      divisor)
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
+        self.act1 = act_layers(activation)
+        self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
+
+    def forward(self, x):
+        x_se = self.avg_pool(x)
+        x_se = self.conv_reduce(x_se)
+        x_se = self.act1(x_se)
+        x_se = self.conv_expand(x_se)
+        x = x * self.gate_fn(x_se)
+        return x
+
+
+class GhostModule(nn.Module):
+
+    def __init__(self,
+                 inp,
+                 oup,
+                 kernel_size=1,
+                 ratio=2,
+                 dw_size=3,
+                 stride=1,
+                 activation='ReLU'):
+        super(GhostModule, self).__init__()
+        self.oup = oup
+        init_channels = math.ceil(oup / ratio)
+        new_channels = init_channels * (ratio - 1)
+
+        self.primary_conv = nn.Sequential(
+            nn.Conv2d(
+                inp,
+                init_channels,
+                kernel_size,
+                stride,
+                kernel_size // 2,
+                bias=False),
+            nn.BatchNorm2d(init_channels),
+            act_layers(activation) if activation else nn.Sequential(),
+        )
+
+        self.cheap_operation = nn.Sequential(
+            nn.Conv2d(
+                init_channels,
+                new_channels,
+                dw_size,
+                1,
+                dw_size // 2,
+                groups=init_channels,
+                bias=False,
+            ),
+            nn.BatchNorm2d(new_channels),
+            act_layers(activation) if activation else nn.Sequential(),
+        )
+
+    def forward(self, x):
+        x1 = self.primary_conv(x)
+        x2 = self.cheap_operation(x1)
+        out = torch.cat([x1, x2], dim=1)
+        return out
+
+
+class GhostBottleneck(nn.Module):
+    """Ghost bottleneck w/ optional SE"""
+
+    def __init__(
+        self,
+        in_chs,
+        mid_chs,
+        out_chs,
+        dw_kernel_size=3,
+        stride=1,
+        activation='ReLU',
+        se_ratio=0.0,
+    ):
+        super(GhostBottleneck, self).__init__()
+        has_se = se_ratio is not None and se_ratio > 0.0
+        self.stride = stride
+
+        # Point-wise expansion
+        self.ghost1 = GhostModule(in_chs, mid_chs, activation=activation)
+
+        # Depth-wise convolution
+        if self.stride > 1:
+            self.conv_dw = nn.Conv2d(
+                mid_chs,
+                mid_chs,
+                dw_kernel_size,
+                stride=stride,
+                padding=(dw_kernel_size - 1) // 2,
+                groups=mid_chs,
+                bias=False,
+            )
+            self.bn_dw = nn.BatchNorm2d(mid_chs)
+
+        if has_se:
+            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio)
+        else:
+            self.se = None
+
+        self.ghost2 = GhostModule(mid_chs, out_chs, activation=None)
+
+        if in_chs == out_chs and self.stride == 1:
+            self.shortcut = nn.Sequential()
+        else:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(
+                    in_chs,
+                    in_chs,
+                    dw_kernel_size,
+                    stride=stride,
+                    padding=(dw_kernel_size - 1) // 2,
+                    groups=in_chs,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(in_chs),
+                nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False),
+                nn.BatchNorm2d(out_chs),
+            )
+
+    def forward(self, x):
+        residual = x
+
+        x = self.ghost1(x)
+
+        if self.stride > 1:
+            x = self.conv_dw(x)
+            x = self.bn_dw(x)
+
+        if self.se is not None:
+            x = self.se(x)
+
+        x = self.ghost2(x)
+
+        x += self.shortcut(residual)
+        return x
+
+
+class GhostBlocks(nn.Module):
+    """Stack of GhostBottleneck used in GhostPAN.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        expand (int): Expand ratio of GhostBottleneck. Default: 1.
+        kernel_size (int): Kernel size of depthwise convolution. Default: 5.
+        num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
+        use_res (bool): Whether to use residual connection. Default: False.
+        activation (str): Name of activation function. Default: LeakyReLU.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        expand=1,
+        kernel_size=5,
+        num_blocks=1,
+        use_res=False,
+        activation='LeakyReLU',
+    ):
+        super(GhostBlocks, self).__init__()
+        self.use_res = use_res
+        if use_res:
+            self.reduce_conv = ConvModule(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                activation=activation,
+            )
+        blocks = []
+        for _ in range(num_blocks):
+            blocks.append(
+                GhostBottleneck(
+                    in_channels,
+                    int(out_channels * expand),
+                    out_channels,
+                    dw_kernel_size=kernel_size,
+                    activation=activation,
+                ))
+        self.blocks = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        out = self.blocks(x)
+        if self.use_res:
+            out = out + self.reduce_conv(x)
+        return out
+
+
+class GhostPAN(nn.Module):
+    """Path Aggregation Network with Ghost block.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        use_depthwise (bool): Whether to use DepthwiseConvModule instead of
+            ConvModule for the downsample and extra-level convolutions.
+            Default: False
+        kernel_size (int): Kernel size of depthwise convolution. Default: 5.
+        expand (int): Expand ratio of GhostBottleneck. Default: 1.
+        num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
+        use_res (bool): Whether to use residual connection. Default: False.
+        num_extra_level (int): Number of extra conv layers for more feature levels.
+            Default: 0.
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: `dict(scale_factor=2, mode='bilinear')`
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN')
+        activation (str): Activation layer name.
+            Default: LeakyReLU.
+    """
+
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            use_depthwise=False,
+            kernel_size=5,
+            expand=1,
+            num_blocks=1,
+            use_res=False,
+            num_extra_level=0,
+            upsample_cfg=dict(scale_factor=2, mode='bilinear'),
+            norm_cfg=dict(type='BN'),
+            activation='LeakyReLU',
+    ):
+        super(GhostPAN, self).__init__()
+        assert num_extra_level >= 0
+        assert num_blocks >= 1
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        conv = DepthwiseConvModule if use_depthwise else ConvModule
+
+        # build top-down blocks
+        self.upsample = nn.Upsample(**upsample_cfg)
+        self.reduce_layers = nn.ModuleList()
+        for idx in range(len(in_channels)):
+            self.reduce_layers.append(
+                ConvModule(
+                    in_channels[idx],
+                    out_channels,
+                    1,
+                    norm_cfg=norm_cfg,
+                    activation=activation,
+                ))
+        self.top_down_blocks = nn.ModuleList()
+        for idx in range(len(in_channels) - 1, 0, -1):
+            self.top_down_blocks.append(
+                GhostBlocks(
+                    out_channels * 2,
+                    out_channels,
+                    expand,
+                    kernel_size=kernel_size,
+                    num_blocks=num_blocks,
+                    use_res=use_res,
+                    activation=activation,
+                ))
+
+        # build bottom-up blocks
+        self.downsamples = nn.ModuleList()
+        self.bottom_up_blocks = nn.ModuleList()
+        for idx in range(len(in_channels) - 1):
+            self.downsamples.append(
+                conv(
+                    out_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=2,
+                    padding=kernel_size // 2,
+                    norm_cfg=norm_cfg,
+                    activation=activation,
+                ))
+            self.bottom_up_blocks.append(
+                GhostBlocks(
+                    out_channels * 2,
+                    out_channels,
+                    expand,
+                    kernel_size=kernel_size,
+                    num_blocks=num_blocks,
+                    use_res=use_res,
+                    activation=activation,
+                ))
+
+        # extra layers
+        self.extra_lvl_in_conv = nn.ModuleList()
+        self.extra_lvl_out_conv = nn.ModuleList()
+        for i in range(num_extra_level):
+            self.extra_lvl_in_conv.append(
+                conv(
+                    out_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=2,
+                    padding=kernel_size // 2,
+                    norm_cfg=norm_cfg,
+                    activation=activation,
+                ))
+            self.extra_lvl_out_conv.append(
+                conv(
+                    out_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=2,
+                    padding=kernel_size // 2,
+                    norm_cfg=norm_cfg,
+                    activation=activation,
+                ))
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (tuple[Tensor]): input features.
+        Returns:
+            tuple[Tensor]: multi level features.
+        """
+        assert len(inputs) == len(self.in_channels)
+        inputs = [
+            reduce(input_x)
+            for input_x, reduce in zip(inputs, self.reduce_layers)
+        ]
+        # top-down path
+        inner_outs = [inputs[-1]]
+        for idx in range(len(self.in_channels) - 1, 0, -1):
+            feat_high = inner_outs[0]
+            feat_low = inputs[idx - 1]
+
+            # Upsample the most recent top-down output, then fuse it with
+            # the next input level by channel concatenation.
+            upsample_feat = self.upsample(feat_high)
+
+            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
+                torch.cat([upsample_feat, feat_low], 1))
+            inner_outs.insert(0, inner_out)
+
+        # bottom-up path
+        outs = [inner_outs[0]]
+        for idx in range(len(self.in_channels) - 1):
+            feat_low = outs[-1]
+            feat_high = inner_outs[idx + 1]
+            downsample_feat = self.downsamples[idx](feat_low)
+            out = self.bottom_up_blocks[idx](
+                torch.cat([downsample_feat, feat_high], 1))
+            outs.append(out)
+
+        # extra layers
+        for extra_in_layer, extra_out_layer in zip(self.extra_lvl_in_conv,
+                                                   self.extra_lvl_out_conv):
+            outs.append(extra_in_layer(inputs[-1]) + extra_out_layer(outs[-1]))
+
+        return tuple(outs)
diff --git a/modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py b/modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py
new file mode 100644
index 00000000..7f5b50ec
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py
@@ -0,0 +1,427 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import math
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.ops import nms
+
+from .utils import ConvModule, DepthwiseConvModule
+
+
+class Integral(nn.Module):
+    """Fixed (non-learned) layer that turns a discrete distribution into
+    its expected value.
+
+    The result is :math:`sum{P(y_i) * y_i}`, where P(y_i) is the softmax
+    over the bins and y_i runs over {0, 1, 2, ..., reg_max}.
+
+    Args:
+        reg_max (int): Largest value of the discrete set. Default: 16.
+    """
+
+    def __init__(self, reg_max=16):
+        super().__init__()
+        self.reg_max = reg_max
+        bins = torch.linspace(0, self.reg_max, self.reg_max + 1)
+        self.register_buffer('project', bins)
+
+    def forward(self, x):
+        """Convert regression logits into expected distances.
+
+        Args:
+            x (Tensor): Regression logits whose last dimension has size
+                4 * (reg_max + 1).
+        Returns:
+            Tensor: Expected values; the last dimension has size 4 and
+                all leading dimensions are kept.
+        """
+        lead_dims = x.size()[:-1]
+        probs = F.softmax(x.reshape(*lead_dims, 4, self.reg_max + 1), dim=-1)
+        weights = self.project.type_as(probs)
+        return F.linear(probs, weights).reshape(*lead_dims, 4)
+
+
+def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False):
+    """Batched non-maximum suppression.
+
+    Adapted from https://github.com/pytorch/vision/blob
+    /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
+    Unless NMS is class agnostic, every box is shifted by an offset that
+    depends only on its ``idxs`` value and is large enough that boxes of
+    different groups cannot overlap, so one NMS call handles all groups.
+
+    Arguments:
+        boxes (torch.Tensor): boxes in shape (N, 4).
+        scores (torch.Tensor): scores in shape (N, ).
+        idxs (torch.Tensor): group index of each box, shape (N, ); boxes
+            with different indices are never suppressed by each other.
+        nms_cfg (dict): NMS settings. Recognized keys:
+            - class_agnostic (bool): overrides the argument of the same
+                name when present.
+            - type: removed and ignored.
+            - split_thr (int): when the number of boxes reaches this
+                value, NMS runs per group sequentially. Defaults to 10000.
+            Remaining keys (e.g. iou_threshold) are forwarded to ``nms``.
+        class_agnostic (bool): if true, no per-group offset is added to
+            the boxes before NMS.
+
+    Returns:
+        tuple: kept detections of shape (K, 5) (box plus score) and the
+            indices of the kept boxes.
+    """
+    cfg = nms_cfg.copy()
+    agnostic = cfg.pop('class_agnostic', class_agnostic)
+    cfg.pop('type', 'nms')
+    split_thr = cfg.pop('split_thr', 10000)
+
+    if agnostic:
+        shifted_boxes = boxes
+    else:
+        # Push every group into its own disjoint coordinate range.
+        class_shift = idxs.to(boxes) * (boxes.max() + 1)
+        shifted_boxes = boxes + class_shift[:, None]
+
+    if len(shifted_boxes) < split_thr:
+        keep = nms(shifted_boxes, scores, **cfg)
+    else:
+        # Too many boxes: run NMS one group at a time.
+        kept_mask = scores.new_zeros(scores.size(), dtype=torch.bool)
+        for group in torch.unique(idxs):
+            members = (idxs == group).nonzero(as_tuple=False).view(-1)
+            survivors = nms(shifted_boxes[members], scores[members], **cfg)
+            kept_mask[members[survivors]] = True
+        keep = kept_mask.nonzero(as_tuple=False).view(-1)
+        keep = keep[scores[keep].argsort(descending=True)]
+
+    kept_scores = scores[keep]
+    dets = torch.cat([boxes[keep], kept_scores[:, None]], -1)
+    return dets, keep
+
+
+def multiclass_nms(multi_bboxes,
+                   multi_scores,
+                   score_thr,
+                   nms_cfg,
+                   max_num=-1,
+                   score_factors=None):
+    """NMS for multi-class bboxes.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class), where the last column
+            contains scores of the background class, but this will be ignored.
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        nms_cfg (dict): NMS config passed unchanged to ``batched_nms``.
+        max_num (int): if there are more than max_num bboxes after NMS,
+            only top max_num will be kept.
+        score_factors (Tensor): The factors multiplied to scores before
+            applying NMS
+
+    Returns:
+        tuple: (bboxes, labels), tensors of shape (k, 5) and (k, ). Labels
+            are 0-based.
+    """
+    num_classes = multi_scores.size(1) - 1
+    if multi_bboxes.shape[1] > 4:
+        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
+    else:
+        bboxes = multi_bboxes[:, None].expand(
+            multi_scores.size(0), num_classes, 4)
+    scores = multi_scores[:, :-1]
+    # The threshold is applied before score_factors is multiplied in.
+    valid_mask = scores > score_thr
+    # Keep only the (box, class) pairs that passed the threshold.
+    bboxes = torch.masked_select(
+        bboxes,
+        torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
+                    -1)).view(-1, 4)
+    if score_factors is not None:
+        scores = scores * score_factors[:, None]
+    scores = torch.masked_select(scores, valid_mask)
+    labels = valid_mask.nonzero(as_tuple=False)[:, 1]
+    # Nothing passed the threshold: return empty results without NMS.
+    if bboxes.numel() == 0:
+        bboxes = multi_bboxes.new_zeros((0, 5))
+        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
+
+        if torch.onnx.is_in_onnx_export():
+            raise RuntimeError('[ONNX Error] Can not record NMS '
+                               'as it has not been executed this time')
+        return bboxes, labels
+
+    dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)
+    # A non-positive max_num disables the truncation below.
+    if max_num > 0:
+        dets = dets[:max_num]
+        keep = keep[:max_num]
+
+    return dets, labels[keep]
+
+
+def distance2bbox(points, distance, max_shape=None):
+    """Turn per-point side distances into corner-format boxes.
+
+    Args:
+        points (Tensor): Shape (n, 2), [x, y].
+        distance (Tensor): Distances from each point to the left, top,
+            right and bottom box edges (last dimension, in that order).
+        max_shape (tuple): Optional (height, width); when given, the box
+            coordinates are clamped to [0, width] and [0, height].
+
+    Returns:
+        Tensor: Boxes as [x1, y1, x2, y2] along the last dimension.
+    """
+    px, py = points[..., 0], points[..., 1]
+    left, top = px - distance[..., 0], py - distance[..., 1]
+    right, bottom = px + distance[..., 2], py + distance[..., 3]
+    if max_shape is not None:
+        max_h, max_w = max_shape[0], max_shape[1]
+        left, right = (v.clamp(min=0, max=max_w) for v in (left, right))
+        top, bottom = (v.clamp(min=0, max=max_h) for v in (top, bottom))
+    corners = [left, top, right, bottom]
+    return torch.stack(corners, -1)
+
+
+def warp_boxes(boxes, M, width, height):
+    """Map boxes through the 3x3 matrix ``M`` and clip them to the image."""
+    count = len(boxes)
+    if not count:
+        return boxes
+    # Homogeneous corners of each box: (x1, y1), (x2, y2), (x1, y2), (x2, y1).
+    corners = np.ones((count * 4, 3))
+    corners[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(count * 4, 2)
+    corners = corners @ M.T
+    corners = (corners[:, :2] / corners[:, 2:3]).reshape(count, 8)
+    xs, ys = corners[:, 0::2], corners[:, 1::2]
+    hull = np.concatenate((xs.min(1), ys.min(1), xs.max(1), ys.max(1)))
+    hull = hull.reshape(4, count).T
+    hull[:, [0, 2]] = hull[:, [0, 2]].clip(0, width)
+    hull[:, [1, 3]] = hull[:, [1, 3]].clip(0, height)
+    return hull.astype(np.float32)
+
+
+class NanoDetPlusHead(nn.Module):
+    """Detection head used in NanoDet-Plus.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        loss (dict): Not a named parameter; swallowed by ``**kwargs``.
+        input_channel (int): Number of channels of the input feature.
+        feat_channels (int): Number of channels of the feature.
+            Default: 96.
+        stacked_convs (int): Number of conv layers in the stacked convs.
+            Default: 2.
+        kernel_size (int): Size of the convolving kernel. Default: 5.
+        strides (list[int]): Strides of input multi-level feature maps.
+            Default: [8, 16, 32].
+        conv_type (str): "Conv" selects ConvModule; any other value selects
+            DepthwiseConvModule. Default: "DWConv".
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN').
+        reg_max (int): The maximal value of the discrete set. Default: 7.
+        activation (str): Type of activation function. Default: "LeakyReLU".
+        assigner_cfg (dict): Accepted but not used. Default: dict(topk=13).
+    """
+
+    def __init__(self,
+                 num_classes,
+                 input_channel,
+                 feat_channels=96,
+                 stacked_convs=2,
+                 kernel_size=5,
+                 strides=[8, 16, 32],
+                 conv_type='DWConv',
+                 norm_cfg=dict(type='BN'),
+                 reg_max=7,
+                 activation='LeakyReLU',
+                 assigner_cfg=dict(topk=13),
+                 **kwargs):
+        super(NanoDetPlusHead, self).__init__()
+        self.num_classes = num_classes
+        self.in_channels = input_channel
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.kernel_size = kernel_size
+        self.strides = strides
+        self.reg_max = reg_max
+        self.activation = activation
+        self.ConvModule = ConvModule if conv_type == 'Conv' else DepthwiseConvModule
+
+        self.norm_cfg = norm_cfg
+        self.distribution_project = Integral(self.reg_max)
+
+        self._init_layers()
+
+    def _init_layers(self):
+        self.cls_convs = nn.ModuleList()
+        for _ in self.strides:
+            cls_convs = self._buid_not_shared_head()
+            self.cls_convs.append(cls_convs)
+
+        self.gfl_cls = nn.ModuleList([
+            nn.Conv2d(
+                self.feat_channels,
+                self.num_classes + 4 * (self.reg_max + 1),
+                1,
+                padding=0,
+            ) for _ in self.strides
+        ])
+
+    def _buid_not_shared_head(self):
+        cls_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            cls_convs.append(
+                self.ConvModule(
+                    chn,
+                    self.feat_channels,
+                    self.kernel_size,
+                    stride=1,
+                    padding=self.kernel_size // 2,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.norm_cfg is None,
+                    activation=self.activation,
+                ))
+        return cls_convs
+
+    def forward(self, feats):
+        if torch.onnx.is_in_onnx_export():
+            return self._forward_onnx(feats)  # NOTE: not defined in this class
+        outputs = []
+        for feat, cls_convs, gfl_cls in zip(
+                feats,
+                self.cls_convs,
+                self.gfl_cls,
+        ):
+            for conv in cls_convs:
+                feat = conv(feat)
+            output = gfl_cls(feat)
+            outputs.append(output.flatten(start_dim=2))
+        outputs = torch.cat(outputs, dim=2).permute(0, 2, 1)
+        return outputs
+
+    def post_process(self, preds, meta):
+        """Prediction results post processing. Decode bboxes and rescale
+        to original image size.
+        Args:
+            preds (Tensor): Prediction output.
+            meta (dict): Meta info.
+        """
+        cls_scores, bbox_preds = preds.split(
+            [self.num_classes, 4 * (self.reg_max + 1)], dim=-1)
+        result_list = self.get_bboxes(cls_scores, bbox_preds, meta)
+        det_results = {}
+        # Used as-is: it is only iterated by the zip below, one matrix per
+        # detection result.
+        warp_matrixes = meta['warp_matrix']
+        img_heights = (
+            meta['img_info']['height'].cpu().numpy() if isinstance(
+                meta['img_info']['height'], torch.Tensor) else
+            meta['img_info']['height'])
+        img_widths = (
+            meta['img_info']['width'].cpu().numpy() if isinstance(
+                meta['img_info']['width'], torch.Tensor) else
+            meta['img_info']['width'])
+        img_ids = (
+            meta['img_info']['id'].cpu().numpy() if isinstance(
+                meta['img_info']['id'], torch.Tensor) else
+            meta['img_info']['id'])
+
+        for result, img_width, img_height, img_id, warp_matrix in zip(
+                result_list, img_widths, img_heights, img_ids, warp_matrixes):
+            det_result = {}
+            det_bboxes, det_labels = result
+            det_bboxes = det_bboxes.detach().cpu().numpy()
+            det_bboxes[:, :4] = warp_boxes(det_bboxes[:, :4],
+                                           np.linalg.inv(warp_matrix),
+                                           img_width, img_height)
+            classes = det_labels.detach().cpu().numpy()
+            for i in range(self.num_classes):
+                inds = classes == i
+                det_result[i] = np.concatenate(
+                    [
+                        det_bboxes[inds, :4].astype(np.float32),
+                        det_bboxes[inds, 4:5].astype(np.float32),
+                    ],
+                    axis=1,
+                ).tolist()
+            det_results[img_id] = det_result
+        return det_results
+
+    def get_bboxes(self, cls_preds, reg_preds, img_metas):
+        """Decode the outputs to bboxes.
+        Args:
+            cls_preds (Tensor): Shape (num_imgs, num_points, num_classes).
+            reg_preds (Tensor): Shape (num_imgs, num_points, 4 * (regmax + 1)).
+            img_metas (dict): Dict of image info.
+
+        Returns:
+            results_list (list[tuple]): List of detection bboxes and labels.
+        """
+        device = cls_preds.device
+        b = cls_preds.shape[0]
+        input_height, input_width = img_metas['img'].shape[2:]
+        input_shape = (input_height, input_width)
+        # ceil() must wrap the whole quotient so both sizes are integers.
+        featmap_sizes = [(math.ceil(input_height / stride),
+                          math.ceil(input_width / stride))
+                         for stride in self.strides]
+        mlvl_center_priors = [
+            self.get_single_level_center_priors(
+                b,
+                featmap_sizes[i],
+                stride,
+                dtype=torch.float32,
+                device=device,
+            ) for i, stride in enumerate(self.strides)
+        ]
+        center_priors = torch.cat(mlvl_center_priors, dim=1)
+        # Scale the predicted distances by each prior's stride (column 2).
+        prior_strides = center_priors[..., 2, None]
+        dis_preds = self.distribution_project(reg_preds) * prior_strides
+        bboxes = distance2bbox(
+            center_priors[..., :2], dis_preds, max_shape=input_shape)
+        scores = cls_preds.sigmoid()
+        result_list = []
+        for i in range(b):
+            score, bbox = scores[i], bboxes[i]
+            padding = score.new_zeros(score.shape[0], 1)
+            score = torch.cat([score, padding], dim=1)
+            results = multiclass_nms(
+                bbox,
+                score,
+                score_thr=0.05,
+                nms_cfg=dict(type='nms', iou_threshold=0.6),
+                max_num=100,
+            )
+            result_list.append(results)
+        return result_list
+
+    def get_single_level_center_priors(self, batch_size, featmap_size, stride,
+                                       dtype, device):
+        """Generate centers of a single stage feature map.
+        Args:
+            batch_size (int): Number of images in one batch.
+            featmap_size (tuple[int]): height and width of the feature map
+            stride (int): down sample stride of the feature map
+            dtype (obj:`torch.dtype`): data type of the tensors
+            device (obj:`torch.device`): device of the tensors
+        Return:
+            priors (Tensor): center priors of a single level feature map.
+        """
+        h, w = featmap_size
+        x_range = (torch.arange(w, dtype=dtype, device=device)) * stride
+        y_range = (torch.arange(h, dtype=dtype, device=device)) * stride
+        y, x = torch.meshgrid(y_range, x_range)
+        y = y.flatten()
+        x = x.flatten()
+        strides = x.new_full((x.shape[0], ), stride)
+        priors = torch.stack([x, y, strides, strides], dim=-1)
+        return priors.unsqueeze(0).repeat(batch_size, 1, 1)
diff --git a/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py b/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py
new file mode 100644
index 00000000..c1d0a52f
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py
@@ -0,0 +1,64 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import torch
+import torch.nn as nn
+
+from .ghost_pan import GhostPAN
+from .nanodet_plus_head import NanoDetPlusHead
+from .shufflenetv2 import ShuffleNetV2
+
+
+class OneStageDetector(nn.Module):
+    """ShuffleNetV2 backbone, GhostPAN neck and NanoDetPlusHead head."""
+    def __init__(self):
+        super(OneStageDetector, self).__init__()
+        self.backbone = ShuffleNetV2(
+            model_size='1.0x',
+            out_stages=(2, 3, 4),
+            with_last_conv=False,
+            kernal_size=3,
+            activation='LeakyReLU',
+            pretrain=False)
+        self.fpn = GhostPAN(
+            in_channels=[116, 232, 464],
+            out_channels=96,
+            use_depthwise=True,
+            kernel_size=5,
+            expand=1,
+            num_blocks=1,
+            use_res=False,
+            num_extra_level=1,
+            upsample_cfg=dict(scale_factor=2, mode='bilinear'),
+            norm_cfg=dict(type='BN'),
+            activation='LeakyReLU')
+        self.head = NanoDetPlusHead(
+            num_classes=3,
+            input_channel=96,
+            feat_channels=96,
+            stacked_convs=2,
+            kernel_size=5,
+            strides=[8, 16, 32, 64],
+            conv_type='DWConv',
+            norm_cfg=dict(type='BN'),
+            reg_max=7,
+            activation='LeakyReLU',
+            assigner_cfg=dict(topk=13))
+        self.epoch = 0
+
+    def forward(self, x):
+        x = self.backbone(x)
+        if hasattr(self, 'fpn'):
+            x = self.fpn(x)
+        if hasattr(self, 'head'):
+            x = self.head(x)
+        return x
+
+    def inference(self, meta):
+        with torch.no_grad():
+            preds = self(meta['img'])
+            # synchronize() raises without CUDA; skip it on CPU-only hosts.
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            results = self.head.post_process(preds, meta)
+        return results
diff --git a/modelscope/models/cv/face_human_hand_detection/shufflenetv2.py b/modelscope/models/cv/face_human_hand_detection/shufflenetv2.py
new file mode 100644
index 00000000..7f4dfc2a
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/shufflenetv2.py
@@ -0,0 +1,182 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import torch
+import torch.nn as nn
+
+from .utils import act_layers
+
+
+def channel_shuffle(x, groups):
+    """Interleave the channels of ``x`` across ``groups`` channel groups.
+
+    ``x`` is unpacked as (batch, channels, height, width); channel ``g * k +
+    i`` (group g, offset i, k channels per group) moves to ``i * groups + g``.
+    """
+    n, c, h, w = x.data.size()
+    per_group = c // groups
+    grouped = x.view(n, groups, per_group, h, w)
+    swapped = grouped.transpose(1, 2).contiguous()
+    return swapped.view(n, -1, h, w)
+
+
+class ShuffleV2Block(nn.Module):
+    """Two-branch unit whose outputs are concatenated and channel-shuffled."""
+    def __init__(self, inp, oup, stride, activation='ReLU'):
+        super(ShuffleV2Block, self).__init__()
+        # Strides outside 1..3 are rejected.
+        if not (1 <= stride <= 3):
+            raise ValueError('illegal stride value')
+        self.stride = stride
+        # Each branch contributes half of the output channels.
+        branch_features = oup // 2
+        assert (self.stride != 1) or (inp == branch_features << 1)
+        # branch1 is a real layer stack only when stride > 1 (else empty).
+        if self.stride > 1:
+            self.branch1 = nn.Sequential(
+                self.depthwise_conv(
+                    inp, inp, kernel_size=3, stride=self.stride, padding=1),
+                nn.BatchNorm2d(inp),
+                nn.Conv2d(
+                    inp,
+                    branch_features,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                    bias=False),
+                nn.BatchNorm2d(branch_features),
+                act_layers(activation),
+            )
+        else:
+            self.branch1 = nn.Sequential()
+        # branch2: 1x1 conv -> 3x3 depthwise conv (carries stride) -> 1x1 conv.
+        self.branch2 = nn.Sequential(
+            nn.Conv2d(
+                inp if (self.stride > 1) else branch_features,
+                branch_features,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=False,
+            ),
+            nn.BatchNorm2d(branch_features),
+            act_layers(activation),
+            self.depthwise_conv(
+                branch_features,
+                branch_features,
+                kernel_size=3,
+                stride=self.stride,
+                padding=1,
+            ),
+            nn.BatchNorm2d(branch_features),
+            nn.Conv2d(
+                branch_features,
+                branch_features,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=False,
+            ),
+            nn.BatchNorm2d(branch_features),
+            act_layers(activation),
+        )
+
+    @staticmethod
+    def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False):
+        return nn.Conv2d(
+            i, o, kernel_size, stride, padding, bias=bias, groups=i)
+
+    def forward(self, x):
+        if self.stride == 1:
+            x1, x2 = x.chunk(2, dim=1)
+            out = torch.cat((x1, self.branch2(x2)), dim=1)
+        else:
+            out = torch.cat((self.branch1(x), self.branch2(x)), dim=1)
+        # Interleave the channels coming from the two concatenated halves.
+        out = channel_shuffle(out, 2)
+
+        return out
+
+
+class ShuffleNetV2(nn.Module):
+    """ShuffleNetV2 backbone returning the feature maps of selected stages."""
+    def __init__(
+        self,
+        model_size='1.5x',
+        out_stages=(2, 3, 4),
+        with_last_conv=False,
+        kernal_size=3,
+        activation='ReLU',
+        pretrain=True,
+    ):
+        super(ShuffleNetV2, self).__init__()
+        assert set(out_stages).issubset((2, 3, 4))
+        # NOTE: this prints to stdout every time the backbone is built.
+        print('model size is ', model_size)
+        # Number of blocks in stage2, stage3 and stage4 respectively.
+        self.stage_repeats = [4, 8, 4]
+        self.model_size = model_size
+        self.out_stages = out_stages
+        self.with_last_conv = with_last_conv
+        self.kernal_size = kernal_size
+        self.activation = activation
+        if model_size == '0.5x':
+            self._stage_out_channels = [24, 48, 96, 192, 1024]
+        elif model_size == '1.0x':
+            self._stage_out_channels = [24, 116, 232, 464, 1024]
+        elif model_size == '1.5x':
+            self._stage_out_channels = [24, 176, 352, 704, 1024]
+        elif model_size == '2.0x':
+            self._stage_out_channels = [24, 244, 488, 976, 2048]
+        else:
+            raise NotImplementedError
+
+        # building first layer
+        input_channels = 3
+        output_channels = self._stage_out_channels[0]
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False),
+            nn.BatchNorm2d(output_channels),
+            act_layers(activation),
+        )
+        input_channels = output_channels
+        # NOTE: kernal_size is only stored and pretrain is never read here.
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        # Each stage opens with a stride-2 block followed by stride-1 blocks.
+        stage_names = ['stage{}'.format(i) for i in [2, 3, 4]]
+        for name, repeats, output_channels in zip(
+                stage_names, self.stage_repeats, self._stage_out_channels[1:]):
+            seq = [
+                ShuffleV2Block(
+                    input_channels, output_channels, 2, activation=activation)
+            ]
+            for i in range(repeats - 1):
+                seq.append(
+                    ShuffleV2Block(
+                        output_channels,
+                        output_channels,
+                        1,
+                        activation=activation))
+            setattr(self, name, nn.Sequential(*seq))
+            input_channels = output_channels
+        output_channels = self._stage_out_channels[-1]
+        if self.with_last_conv:
+            conv5 = nn.Sequential(
+                nn.Conv2d(
+                    input_channels, output_channels, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(output_channels),
+                act_layers(activation),
+            )
+            self.stage4.add_module('conv5', conv5)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.maxpool(x)
+        output = []
+        # Collect the output of every stage listed in out_stages.
+        for i in range(2, 5):
+            stage = getattr(self, 'stage{}'.format(i))
+            x = stage(x)
+            if i in self.out_stages:
+                output.append(x)
+        return tuple(output)
diff --git a/modelscope/models/cv/face_human_hand_detection/utils.py b/modelscope/models/cv/face_human_hand_detection/utils.py
new file mode 100644
index 00000000..f989c164
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/utils.py
@@ -0,0 +1,277 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import torch
+import torch.nn as nn
+
+activations = {
+    'ReLU': nn.ReLU,
+    'LeakyReLU': nn.LeakyReLU,
+    'ReLU6': nn.ReLU6,
+    'SELU': nn.SELU,
+    'ELU': nn.ELU,
+    'GELU': nn.GELU,
+    'PReLU': nn.PReLU,
+    'SiLU': nn.SiLU,
+    'HardSwish': nn.Hardswish,
+    'Hardswish': nn.Hardswish,
+    None: nn.Identity,
+}
+
+
+def act_layers(name):
+    assert name in activations.keys()
+    if name == 'LeakyReLU':
+        return nn.LeakyReLU(negative_slope=0.1, inplace=True)
+    elif name == 'GELU':
+        return nn.GELU()
+    elif name == 'PReLU':
+        return nn.PReLU()
+    else:
+        return activations[name](inplace=True)
+
+
+norm_cfg = {
+    'BN': ('bn', nn.BatchNorm2d),
+    'SyncBN': ('bn', nn.SyncBatchNorm),
+    'GN': ('gn', nn.GroupNorm),
+}
+
+
+def build_norm_layer(cfg, num_features, postfix=''):
+    """Build normalization layer
+
+    Args:
+        cfg (dict): cfg should contain:
+            type (str): identify norm layer type.
+            layer args: args needed to instantiate a norm layer.
+            requires_grad (bool): [optional] whether stop gradient updates
+        num_features (int): number of channels from input.
+        postfix (int, str): appended into norm abbreviation to
+            create named layer.
+
+    Returns:
+        name (str): abbreviation + postfix
+        layer (nn.Module): created norm layer
+    """
+    assert isinstance(cfg, dict) and 'type' in cfg
+    cfg_ = cfg.copy()
+
+    layer_type = cfg_.pop('type')
+    if layer_type not in norm_cfg:
+        raise KeyError('Unrecognized norm type {}'.format(layer_type))
+    else:
+        abbr, norm_layer = norm_cfg[layer_type]
+        if norm_layer is None:
+            raise NotImplementedError
+
+    assert isinstance(postfix, (int, str))
+    name = abbr + str(postfix)
+
+    requires_grad = cfg_.pop('requires_grad', True)
+    cfg_.setdefault('eps', 1e-5)
+    if layer_type != 'GN':
+        layer = norm_layer(num_features, **cfg_)
+        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
+            layer._specify_ddp_gpu_num(1)
+    else:
+        assert 'num_groups' in cfg_
+        layer = norm_layer(num_channels=num_features, **cfg_)
+
+    for param in layer.parameters():
+        param.requires_grad = requires_grad
+
+    return name, layer
+
+
+class ConvModule(nn.Module):
+    """A conv block that contains conv/norm/activation layers.
+
+    Args:
+        in_channels (int): Same as nn.Conv2d.
+        out_channels (int): Same as nn.Conv2d.
+        kernel_size (int or tuple[int]): Same as nn.Conv2d.
+        stride (int or tuple[int]): Same as nn.Conv2d.
+        padding (int or tuple[int]): Same as nn.Conv2d.
+        dilation (int or tuple[int]): Same as nn.Conv2d.
+        groups (int): Same as nn.Conv2d.
+        bias (bool or str): If specified as `auto`, it will be decided by the
+            norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
+            False.
+        conv_cfg (dict): Stored only; the conv built is always nn.Conv2d.
+        norm_cfg (dict): Config for build_norm_layer; None disables the norm.
+        activation (str): Name passed to act_layers, "ReLU" by default.
+        inplace (bool): Stored only; it is not forwarded to act_layers.
+        order (tuple[str]): The order of conv/norm/activation layers. It is a
+            sequence of "conv", "norm" and "act". Examples are
+            ("conv", "norm", "act") and ("act", "conv", "norm").
+    """
+
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            groups=1,
+            bias='auto',
+            conv_cfg=None,
+            norm_cfg=None,
+            activation='ReLU',
+            inplace=True,
+            order=('conv', 'norm', 'act'),
+    ):
+        super(ConvModule, self).__init__()
+        assert conv_cfg is None or isinstance(conv_cfg, dict)
+        assert norm_cfg is None or isinstance(norm_cfg, dict)
+        assert activation is None or isinstance(activation, str)
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.activation = activation
+        self.inplace = inplace
+        self.order = order
+        assert isinstance(self.order, tuple) and len(self.order) == 3
+        assert set(order) == {'conv', 'norm', 'act'}
+
+        self.with_norm = norm_cfg is not None
+        if bias == 'auto':
+            bias = False if self.with_norm else True
+        self.with_bias = bias
+        if self.with_norm and self.with_bias:
+            import warnings  # local: the module has no top-level import
+            warnings.warn('ConvModule has norm and bias at the same time')
+
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self.in_channels = self.conv.in_channels
+        self.out_channels = self.conv.out_channels
+        self.kernel_size = self.conv.kernel_size
+        self.stride = self.conv.stride
+        self.padding = self.conv.padding
+        self.dilation = self.conv.dilation
+        self.transposed = self.conv.transposed
+        self.output_padding = self.conv.output_padding
+        self.groups = self.conv.groups
+
+        if self.with_norm:
+            if order.index('norm') > order.index('conv'):
+                norm_channels = out_channels
+            else:
+                norm_channels = in_channels
+            self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
+            self.add_module(self.norm_name, norm)
+        else:
+            self.norm_name = None
+
+        if self.activation:
+            self.act = act_layers(self.activation)
+
+    @property
+    def norm(self):
+        """Return the norm layer, or None when norm_cfg was not given."""
+        if self.norm_name:
+            return getattr(self, self.norm_name)
+        return None
+
+    def forward(self, x, norm=True):
+        for layer in self.order:
+            if layer == 'conv':
+                x = self.conv(x)
+            elif layer == 'norm' and norm and self.with_norm:
+                x = self.norm(x)
+            elif layer == 'act' and self.activation:
+                x = self.act(x)
+        return x
+
+
+class DepthwiseConvModule(nn.Module):
+    """Depthwise conv plus 1x1 pointwise conv, each with an optional norm.
+
+    Norm steps are skipped when norm_cfg is None; forward() ignores `norm`.
+    """
+
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            bias='auto',
+            norm_cfg=dict(type='BN'),
+            activation='ReLU',
+            inplace=True,
+            order=('depthwise', 'dwnorm', 'act', 'pointwise', 'pwnorm', 'act'),
+    ):
+        super(DepthwiseConvModule, self).__init__()
+        assert activation is None or isinstance(activation, str)
+        self.activation = activation
+        self.inplace = inplace
+        self.order = order
+        assert isinstance(self.order, tuple) and len(self.order) == 6
+        assert set(order) == {
+            'depthwise', 'dwnorm', 'act', 'pointwise', 'pwnorm'
+        }
+
+        self.with_norm = norm_cfg is not None
+        if bias == 'auto':
+            bias = False if self.with_norm else True
+        self.with_bias = bias
+        if self.with_norm and self.with_bias:
+            import warnings  # local: the module has no top-level import
+            warnings.warn('ConvModule has norm and bias at the same time')
+
+        self.depthwise = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=bias,
+        )
+        self.pointwise = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias)
+
+        self.in_channels = self.depthwise.in_channels
+        self.out_channels = self.pointwise.out_channels
+        self.kernel_size = self.depthwise.kernel_size
+        self.stride = self.depthwise.stride
+        self.padding = self.depthwise.padding
+        self.dilation = self.depthwise.dilation
+        self.transposed = self.depthwise.transposed
+        self.output_padding = self.depthwise.output_padding
+
+        if self.with_norm:
+            _, self.dwnorm = build_norm_layer(norm_cfg, in_channels)
+            _, self.pwnorm = build_norm_layer(norm_cfg, out_channels)
+
+        if self.activation:
+            self.act = act_layers(self.activation)
+
+    def forward(self, x, norm=True):
+        for layer_name in self.order:
+            if layer_name == 'act':
+                if self.activation:
+                    x = self.act(x)
+            elif self.with_norm or not layer_name.endswith('norm'):
+                # dwnorm/pwnorm are only created when norm_cfg is given.
+                x = getattr(self, layer_name)(x)
+        return x
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 357afd07..52f3c47e 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -649,8 +649,17 @@ TASK_OUTPUTS = {
     #     'output': ['Done' / 'Decode_Error']
     # }
     Tasks.video_inpainting: [OutputKeys.OUTPUT],
+
     # {
     #     'output': ['bixin']
     # }
-    Tasks.hand_static: [OutputKeys.OUTPUT]
+    Tasks.hand_static: [OutputKeys.OUTPUT],
+
+    # {
+    #     'output': [
+    #                [2, 75, 287, 240, 510, 0.8335018754005432],
+    #                [1, 127, 83, 332, 366, 0.9175254702568054],
+    #                [0, 0, 0, 367, 639, 0.9693422317504883]]
+    # }
+    Tasks.face_human_hand_detection: [OutputKeys.OUTPUT],
 }
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 4f6873b0..a14b07a6 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -183,6 +183,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                              'damo/cv_video-inpainting'),
     Tasks.hand_static: (Pipelines.hand_static,
                         'damo/cv_mobileface_hand-static'),
+    Tasks.face_human_hand_detection:
+    (Pipelines.face_human_hand_detection,
+     'damo/cv_nanodet_face-human-hand-detection'),
 }
 
 
diff --git a/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py b/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py
new file mode 100644
index 00000000..d9f214c9
--- /dev/null
+++ b/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py
@@ -0,0 +1,42 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_human_hand_detection import det_infer
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_human_hand_detection,
+    module_name=Pipelines.face_human_hand_detection)
+class NanoDettForFaceHumanHandDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create face-human-hand detection pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+
+        super().__init__(model=model, **kwargs)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+
+        result = det_infer.inference(self.model, self.device,
+                                     input['input_path'])
+        logger.info(result)
+        return {OutputKeys.OUTPUT: result}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index b19c0fce..ac6846e4 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -43,6 +43,7 @@ class CVTasks(object):
     text_driven_segmentation = 'text-driven-segmentation'
     shop_segmentation = 'shop-segmentation'
     hand_static = 'hand-static'
+    face_human_hand_detection = 'face-human-hand-detection'
 
     # image editing
     skin_retouching = 'skin-retouching'
diff --git a/tests/pipelines/test_face_human_hand_detection.py b/tests/pipelines/test_face_human_hand_detection.py
new file mode 100644
index 00000000..7aaa67e7
--- /dev/null
+++ b/tests/pipelines/test_face_human_hand_detection.py
@@ -0,0 +1,38 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class FaceHumanHandTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_nanodet_face-human-hand-detection'
+        self.input = {
+            'input_path': 'data/test/images/face_human_hand_detection.jpg',
+        }
+
+    def pipeline_inference(self, pipeline: Pipeline, input: str):
+        result = pipeline(input)
+        logger.info(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        face_human_hand_detection = pipeline(
+            Tasks.face_human_hand_detection, model=self.model_id)
+        self.pipeline_inference(face_human_hand_detection, self.input)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        face_human_hand_detection = pipeline(Tasks.face_human_hand_detection)
+        self.pipeline_inference(face_human_hand_detection, self.input)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 7f468acca37f91c2cc900c62caa65c653ea514d7 Mon Sep 17 00:00:00 2001
From: "hanyuan.chy" <hanyuan.chy@alibaba-inc.com>
Date: Sat, 1 Oct 2022 18:34:23 +0800
Subject: [PATCH 171/175] [to #42322933]style(license): add license + render
 result poses with video         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10263904

---
 .../cv/body_3d_keypoints/body_3d_pose.py      |   2 +
 .../canonical_pose_modules.py                 |   2 +-
 modelscope/outputs.py                         |  21 ++-
 .../cv/body_3d_keypoints_pipeline.py          | 157 +++++++++++++++++-
 tests/pipelines/test_body_3d_keypoints.py     |  19 ++-
 5 files changed, 183 insertions(+), 18 deletions(-)

diff --git a/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
index 87cd4962..3e920d12 100644
--- a/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
+++ b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import logging
 import os.path as osp
 from typing import Any, Dict, List, Union
diff --git a/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
index b3eac2e5..b7f0c4a3 100644
--- a/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
+++ b/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
@@ -1,4 +1,4 @@
-# The implementation is based on OSTrack, available at https://github.com/facebookresearch/VideoPose3D
+# The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 52f3c47e..f13bbed9 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -21,6 +21,7 @@ class OutputKeys(object):
     POLYGONS = 'polygons'
     OUTPUT = 'output'
     OUTPUT_IMG = 'output_img'
+    OUTPUT_VIDEO = 'output_video'
     OUTPUT_PCM = 'output_pcm'
     IMG_EMBEDDING = 'img_embedding'
     SPO_LIST = 'spo_list'
@@ -218,13 +219,21 @@ TASK_OUTPUTS = {
 
     # 3D human body keypoints detection result for single sample
     # {
-    #   "poses": [
-    #               [[x, y, z]*17],
-    #               [[x, y, z]*17],
-    #               [[x, y, z]*17]
-    #             ]
+    #   "poses": [		    # 3d pose coordinate in camera coordinate
+    #     	[[x, y, z]*17],	# joints of per image
+    #     	[[x, y, z]*17],
+    #     	...
+    #     ],
+    #   "timestamps": [     # timestamps of all frames
+    #     "00:00:0.230",
+    #     "00:00:0.560",
+    #     "00:00:0.690",
+    #   ],
+    #   "output_video": "path_to_rendered_video" , this is optional
+    # and is only available when the "render" option is enabled.
     # }
-    Tasks.body_3d_keypoints: [OutputKeys.POSES],
+    Tasks.body_3d_keypoints:
+    [OutputKeys.POSES, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO],
 
     # 2D hand keypoints result for single sample
     # {
diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
index e9e4e9e8..474c0e54 100644
--- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
+++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
@@ -1,10 +1,19 @@
-import os
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import datetime
 import os.path as osp
+import tempfile
 from typing import Any, Dict, List, Union
 
 import cv2
+import matplotlib
+import matplotlib.pyplot as plt
+import mpl_toolkits.mplot3d.axes3d as p3
 import numpy as np
 import torch
+from matplotlib import animation
+from matplotlib.animation import writers
+from matplotlib.ticker import MultipleLocator
 
 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.body_3d_keypoints.body_3d_pose import (
@@ -16,6 +25,8 @@ from modelscope.pipelines.builder import PIPELINES
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 
+matplotlib.use('Agg')
+
 logger = get_logger()
 
 
@@ -121,7 +132,13 @@ class Body3DKeypointsPipeline(Pipeline):
             device='gpu' if torch.cuda.is_available() else 'cpu')
 
     def preprocess(self, input: Input) -> Dict[str, Any]:
-        video_frames = self.read_video_frames(input)
+        video_url = input.get('input_video')
+        self.output_video_path = input.get('output_video_path')
+        if self.output_video_path is None:
+            self.output_video_path = tempfile.NamedTemporaryFile(
+                suffix='.mp4').name
+
+        video_frames = self.read_video_frames(video_url)
         if 0 == len(video_frames):
             res = {'success': False, 'msg': 'get video frame failed.'}
             return res
@@ -168,13 +185,21 @@ class Body3DKeypointsPipeline(Pipeline):
         return res
 
     def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
-        res = {OutputKeys.POSES: []}
+        res = {OutputKeys.POSES: [], OutputKeys.TIMESTAMPS: []}
 
         if not input['success']:
             pass
         else:
             poses = input[KeypointsTypes.POSES_CAMERA]
-            res = {OutputKeys.POSES: poses.data.cpu().numpy()}
+            pred_3d_pose = poses.data.cpu().numpy()[
+                0]  # [frame_num, joint_num, joint_dim]
+
+            if 'render' in self.keypoint_model_3d.cfg.keys():
+                self.render_prediction(pred_3d_pose)
+                res[OutputKeys.OUTPUT_VIDEO] = self.output_video_path
+
+            res[OutputKeys.POSES] = pred_3d_pose
+            res[OutputKeys.TIMESTAMPS] = self.timestamps
         return res
 
     def read_video_frames(self, video_url: Union[str, cv2.VideoCapture]):
@@ -189,7 +214,15 @@ class Body3DKeypointsPipeline(Pipeline):
         Returns:
             [nd.array]: List of video frames.
         """
+
+        def timestamp_format(seconds):
+            m, s = divmod(seconds, 60)
+            h, m = divmod(m, 60)
+            time = '%02d:%02d:%06.3f' % (h, m, s)
+            return time
+
         frames = []
+        self.timestamps = []  # for video render
         if isinstance(video_url, str):
             cap = cv2.VideoCapture(video_url)
             if not cap.isOpened():
@@ -199,15 +232,131 @@ class Body3DKeypointsPipeline(Pipeline):
         else:
             cap = video_url
 
+        self.fps = cap.get(cv2.CAP_PROP_FPS)
+        if self.fps is None or self.fps <= 0:
+            raise Exception('modelscope error: %s cannot get video fps info.' %
+                            (video_url))
+
         max_frame_num = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME
         frame_idx = 0
         while True:
             ret, frame = cap.read()
             if not ret:
                 break
+            self.timestamps.append(
+                timestamp_format(seconds=frame_idx / self.fps))
             frame_idx += 1
             frames.append(frame)
             if frame_idx >= max_frame_num:
                 break
         cap.release()
         return frames
+
+    def render_prediction(self, pose3d_cam_rr):
+        """render predict result 3d poses.
+
+        Args:
+            pose3d_cam_rr (nd.array): [frame_num, joint_num, joint_dim], 3d pose joints
+
+        Returns:
+        """
+        frame_num = pose3d_cam_rr.shape[0]
+
+        left_points = [11, 12, 13, 4, 5, 6]  # joints of left body
+        edges = [[0, 1], [0, 4], [0, 7], [1, 2], [4, 5], [5, 6], [2,
+                                                                  3], [7, 8],
+                 [8, 9], [8, 11], [8, 14], [14, 15], [15, 16], [11, 12],
+                 [12, 13], [9, 10]]  # connection between joints
+
+        fig = plt.figure()
+        ax = p3.Axes3D(fig)
+        x_major_locator = MultipleLocator(0.5)
+
+        ax.xaxis.set_major_locator(x_major_locator)
+        ax.yaxis.set_major_locator(x_major_locator)
+        ax.zaxis.set_major_locator(x_major_locator)
+        ax.set_xlabel('X')
+        ax.set_ylabel('Y')
+        ax.set_zlabel('Z')
+        ax.set_xlim(-1, 1)
+        ax.set_ylim(-1, 1)
+        ax.set_zlim(-1, 1)
+        # view direction
+        azim = self.keypoint_model_3d.cfg.render.azim
+        elev = self.keypoint_model_3d.cfg.render.elev
+        ax.view_init(elev, azim)
+
+        # init plot, essentially
+        x = pose3d_cam_rr[0, :, 0]
+        y = pose3d_cam_rr[0, :, 1]
+        z = pose3d_cam_rr[0, :, 2]
+        points, = ax.plot(x, y, z, 'r.')
+
+        def renderBones(xs, ys, zs):
+            """render bones in skeleton
+
+            Args:
+                xs (nd.array): [joint_num, joint_channel]
+                ys (nd.array): [joint_num, joint_channel]
+                zs (nd.array): [joint_num, joint_channel]
+            """
+            bones = {}
+            for idx, edge in enumerate(edges):
+                index1, index2 = edge[0], edge[1]
+                if index1 in left_points:
+                    edge_color = 'red'
+                else:
+                    edge_color = 'blue'
+                connect = ax.plot([xs[index1], xs[index2]],
+                                  [ys[index1], ys[index2]],
+                                  [zs[index1], zs[index2]],
+                                  linewidth=2,
+                                  color=edge_color)  # plot edge
+                bones[idx] = connect[0]
+            return bones
+
+        bones = renderBones(x, y, z)
+
+        def update(frame_idx, points, bones):
+            """Move the joint and bone artists to the pose of one frame.
+
+            Args:
+                frame_idx (int): index into the first axis of pose3d_cam_rr
+                points (mpl_toolkits.mplot3d.art3d.Line3D): joint marker artist
+                bones (dict[int, mpl_toolkits.mplot3d.art3d.Line3D]): edge artists
+
+            Returns:
+                tuple: the same points and bones objects, after updating
+            """
+            xs = pose3d_cam_rr[frame_idx, :, 0]
+            ys = pose3d_cam_rr[frame_idx, :, 1]
+            zs = pose3d_cam_rr[frame_idx, :, 2]
+
+            # update bones
+            for idx, edge in enumerate(edges):
+                index1, index2 = edge[0], edge[1]
+                x1x2 = (xs[index1], xs[index2])
+                y1y2 = (ys[index1], ys[index2])
+                z1z2 = (zs[index1], zs[index2])
+                bones[idx].set_xdata(x1x2)
+                bones[idx].set_ydata(y1y2)
+                bones[idx].set_3d_properties(z1z2, 'z')
+
+            # update joints
+            points.set_data(xs, ys)
+            points.set_3d_properties(zs, 'z')
+            if 0 == frame_idx % 100:  # log progress once every 100 frames
+                logger.info(f'rendering {frame_idx}/{frame_num}')
+            return points, bones
+
+        ani = animation.FuncAnimation(
+            fig=fig,
+            func=update,
+            frames=frame_num,
+            interval=self.fps,
+            fargs=(points, bones))
+
+        # save mp4
+        Writer = writers['ffmpeg']
+        writer = Writer(fps=self.fps, metadata={}, bitrate=4096)
+        ani.save(self.output_video_path, writer=writer)
diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py
index 9dce0d19..bde04f8e 100644
--- a/tests/pipelines/test_body_3d_keypoints.py
+++ b/tests/pipelines/test_body_3d_keypoints.py
@@ -28,7 +28,12 @@ class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_modelhub_with_video_file(self):
         body_3d_keypoints = pipeline(
             Tasks.body_3d_keypoints, model=self.model_id)
-        self.pipeline_inference(body_3d_keypoints, self.test_video)
+        pipeline_input = {
+            'input_video': self.test_video,
+            'output_video_path': './result.mp4'
+        }
+        self.pipeline_inference(
+            body_3d_keypoints, pipeline_input=pipeline_input)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub_with_video_stream(self):
@@ -37,12 +42,12 @@ class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck):
         if not cap.isOpened():
             raise Exception('modelscope error: %s cannot be decoded by OpenCV.'
                             % (self.test_video))
-        self.pipeline_inference(body_3d_keypoints, cap)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_modelhub_default_model(self):
-        body_3d_keypoints = pipeline(Tasks.body_3d_keypoints)
-        self.pipeline_inference(body_3d_keypoints, self.test_video)
+        pipeline_input = {
+            'input_video': cap,
+            'output_video_path': './result.mp4'
+        }
+        self.pipeline_inference(
+            body_3d_keypoints, pipeline_input=pipeline_input)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_demo_compatibility(self):

From 5343c899fbaac1f33bdb208c8e99944af962ca7a Mon Sep 17 00:00:00 2001
From: "tingwei.gtw" <tingwei.gtw@alibaba-inc.com>
Date: Sat, 1 Oct 2022 18:35:42 +0800
Subject: [PATCH 172/175] [to #42322933] add face-emotion pipeline        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10202117

---
 data/test/images/face_emotion.jpg             |   3 +
 modelscope/metainfo.py                        |   2 +
 modelscope/models/cv/face_emotion/__init__.py |  20 +
 .../cv/face_emotion/efficient/__init__.py     |   6 +
 .../models/cv/face_emotion/efficient/model.py | 380 ++++++++++++
 .../models/cv/face_emotion/efficient/utils.py | 559 ++++++++++++++++++
 .../models/cv/face_emotion/emotion_infer.py   |  67 +++
 .../models/cv/face_emotion/emotion_model.py   |  96 +++
 .../face_emotion/face_alignment/__init__.py   |   0
 .../cv/face_emotion/face_alignment/face.py    |  79 +++
 .../face_emotion/face_alignment/face_align.py |  59 ++
 modelscope/outputs.py                         |   6 +-
 modelscope/pipelines/builder.py               |   1 +
 .../pipelines/cv/face_emotion_pipeline.py     |  39 ++
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_face_emotion.py          |  32 +
 16 files changed, 1349 insertions(+), 1 deletion(-)
 create mode 100644 data/test/images/face_emotion.jpg
 create mode 100644 modelscope/models/cv/face_emotion/__init__.py
 create mode 100644 modelscope/models/cv/face_emotion/efficient/__init__.py
 create mode 100644 modelscope/models/cv/face_emotion/efficient/model.py
 create mode 100644 modelscope/models/cv/face_emotion/efficient/utils.py
 create mode 100644 modelscope/models/cv/face_emotion/emotion_infer.py
 create mode 100644 modelscope/models/cv/face_emotion/emotion_model.py
 create mode 100644 modelscope/models/cv/face_emotion/face_alignment/__init__.py
 create mode 100644 modelscope/models/cv/face_emotion/face_alignment/face.py
 create mode 100644 modelscope/models/cv/face_emotion/face_alignment/face_align.py
 create mode 100644 modelscope/pipelines/cv/face_emotion_pipeline.py
 create mode 100644 tests/pipelines/test_face_emotion.py

diff --git a/data/test/images/face_emotion.jpg b/data/test/images/face_emotion.jpg
new file mode 100644
index 00000000..54f22280
--- /dev/null
+++ b/data/test/images/face_emotion.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:712b5525e37080d33f62d6657609dbef20e843ccc04ee5c788ea11aa7c08545e
+size 123341
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 54e09f7a..ae8b5297 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -41,6 +41,7 @@ class Models(object):
     video_inpainting = 'video-inpainting'
     hand_static = 'hand-static'
     face_human_hand_detection = 'face-human-hand-detection'
+    face_emotion = 'face-emotion'
 
     # EasyCV models
     yolox = 'YOLOX'
@@ -183,6 +184,7 @@ class Pipelines(object):
     pst_action_recognition = 'patchshift-action-recognition'
     hand_static = 'hand-static'
     face_human_hand_detection = 'face-human-hand-detection'
+    face_emotion = 'face-emotion'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
diff --git a/modelscope/models/cv/face_emotion/__init__.py b/modelscope/models/cv/face_emotion/__init__.py
new file mode 100644
index 00000000..2a13ea42
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .emotion_model import EfficientNetForFaceEmotion
+
+else:
+    _import_structure = {'emotion_model': ['EfficientNetForFaceEmotion']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/face_emotion/efficient/__init__.py b/modelscope/models/cv/face_emotion/efficient/__init__.py
new file mode 100644
index 00000000..e8fc91a4
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/efficient/__init__.py
@@ -0,0 +1,6 @@
+# The implementation here is modified based on EfficientNet,
+# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch
+
+from .model import VALID_MODELS, EfficientNet
+from .utils import (BlockArgs, BlockDecoder, GlobalParams, efficientnet,
+                    get_model_params)
diff --git a/modelscope/models/cv/face_emotion/efficient/model.py b/modelscope/models/cv/face_emotion/efficient/model.py
new file mode 100644
index 00000000..db303016
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/efficient/model.py
@@ -0,0 +1,380 @@
+# The implementation here is modified based on EfficientNet,
+# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .utils import (MemoryEfficientSwish, Swish, calculate_output_image_size,
+                    drop_connect, efficientnet_params, get_model_params,
+                    get_same_padding_conv2d, load_pretrained_weights,
+                    round_filters, round_repeats)
+
+VALID_MODELS = ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2',
+                'efficientnet-b3', 'efficientnet-b4', 'efficientnet-b5',
+                'efficientnet-b6', 'efficientnet-b7', 'efficientnet-b8',
+                'efficientnet-l2')
+
+
+class MBConvBlock(nn.Module):
+
+    def __init__(self, block_args, global_params, image_size=None):
+        super().__init__()
+        self._block_args = block_args
+        self._bn_mom = 1 - global_params.batch_norm_momentum
+        self._bn_eps = global_params.batch_norm_epsilon
+        self.has_se = (self._block_args.se_ratio
+                       is not None) and (0 < self._block_args.se_ratio <= 1)
+        self.id_skip = block_args.id_skip
+
+        inp = self._block_args.input_filters
+        oup = self._block_args.input_filters * self._block_args.expand_ratio
+        if self._block_args.expand_ratio != 1:
+            Conv2d = get_same_padding_conv2d(image_size=image_size)
+            self._expand_conv = Conv2d(
+                in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
+            self._bn0 = nn.BatchNorm2d(
+                num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
+
+        k = self._block_args.kernel_size
+        s = self._block_args.stride
+        Conv2d = get_same_padding_conv2d(image_size=image_size)
+        self._depthwise_conv = Conv2d(
+            in_channels=oup,
+            out_channels=oup,
+            groups=oup,
+            kernel_size=k,
+            stride=s,
+            bias=False)
+        self._bn1 = nn.BatchNorm2d(
+            num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
+        image_size = calculate_output_image_size(image_size, s)
+
+        if self.has_se:
+            Conv2d = get_same_padding_conv2d(image_size=(1, 1))
+            num_squeezed_channels = max(
+                1,
+                int(self._block_args.input_filters
+                    * self._block_args.se_ratio))
+            self._se_reduce = Conv2d(
+                in_channels=oup,
+                out_channels=num_squeezed_channels,
+                kernel_size=1)
+            self._se_expand = Conv2d(
+                in_channels=num_squeezed_channels,
+                out_channels=oup,
+                kernel_size=1)
+
+        final_oup = self._block_args.output_filters
+        Conv2d = get_same_padding_conv2d(image_size=image_size)
+        self._project_conv = Conv2d(
+            in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
+        self._bn2 = nn.BatchNorm2d(
+            num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
+        self._swish = MemoryEfficientSwish()
+
+    def forward(self, inputs, drop_connect_rate=None):
+        """MBConvBlock's forward function.
+        Args:
+            inputs (tensor): Input tensor.
+            drop_connect_rate (float): Drop connect rate, between 0 and 1.
+        Returns:
+            Output of this block after processing.
+        """
+
+        x = inputs
+        if self._block_args.expand_ratio != 1:
+            x = self._expand_conv(inputs)
+            x = self._bn0(x)
+            x = self._swish(x)
+
+        x = self._depthwise_conv(x)
+        x = self._bn1(x)
+        x = self._swish(x)
+
+        if self.has_se:
+            x_squeezed = F.adaptive_avg_pool2d(x, 1)
+            x_squeezed = self._se_reduce(x_squeezed)
+            x_squeezed = self._swish(x_squeezed)
+            x_squeezed = self._se_expand(x_squeezed)
+            x = torch.sigmoid(x_squeezed) * x
+
+        x = self._project_conv(x)
+        x = self._bn2(x)
+
+        input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
+        if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
+            if drop_connect_rate:
+                x = drop_connect(
+                    x, p=drop_connect_rate, training=self.training)
+            x = x + inputs
+        return x
+
+    def set_swish(self, memory_efficient=True):
+        """Sets swish function as memory efficient (for training) or standard (for export).
+        Args:
+            memory_efficient (bool): Whether to use memory-efficient version of swish.
+        """
+        self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
+
+
+class EfficientNet(nn.Module):
+    """EfficientNet model.
+       Most easily loaded with the .from_name or .from_pretrained methods.
+    Args:
+        blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks.
+        global_params (namedtuple): A set of GlobalParams shared between blocks.
+    References:
+        [1] https://arxiv.org/abs/1905.11946 (EfficientNet)
+    Example:
+        >>> import torch
+        >>> from efficientnet.model import EfficientNet
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> model = EfficientNet.from_pretrained('efficientnet-b0')
+        >>> model.eval()
+        >>> outputs = model(inputs)
+    """
+
+    def __init__(self, blocks_args=None, global_params=None):
+        super().__init__()
+        assert isinstance(blocks_args, list), 'blocks_args should be a list'
+        assert len(blocks_args) > 0, 'block args must be greater than 0'
+        self._global_params = global_params
+        self._blocks_args = blocks_args
+
+        bn_mom = 1 - self._global_params.batch_norm_momentum
+        bn_eps = self._global_params.batch_norm_epsilon
+        image_size = global_params.image_size
+        Conv2d = get_same_padding_conv2d(image_size=image_size)
+
+        in_channels = 3
+        out_channels = round_filters(32, self._global_params)
+        self._conv_stem = Conv2d(
+            in_channels, out_channels, kernel_size=3, stride=2, bias=False)
+        self._bn0 = nn.BatchNorm2d(
+            num_features=out_channels, momentum=bn_mom, eps=bn_eps)
+        image_size = calculate_output_image_size(image_size, 2)
+
+        self._blocks = nn.ModuleList([])
+        for block_args in self._blocks_args:
+
+            block_args = block_args._replace(
+                input_filters=round_filters(block_args.input_filters,
+                                            self._global_params),
+                output_filters=round_filters(block_args.output_filters,
+                                             self._global_params),
+                num_repeat=round_repeats(block_args.num_repeat,
+                                         self._global_params))
+
+            self._blocks.append(
+                MBConvBlock(
+                    block_args, self._global_params, image_size=image_size))
+            image_size = calculate_output_image_size(image_size,
+                                                     block_args.stride)
+            if block_args.num_repeat > 1:
+                block_args = block_args._replace(
+                    input_filters=block_args.output_filters, stride=1)
+            for _ in range(block_args.num_repeat - 1):
+                self._blocks.append(
+                    MBConvBlock(
+                        block_args, self._global_params,
+                        image_size=image_size))
+
+        in_channels = block_args.output_filters
+        out_channels = round_filters(1280, self._global_params)
+        Conv2d = get_same_padding_conv2d(image_size=image_size)
+        self._conv_head = Conv2d(
+            in_channels, out_channels, kernel_size=1, bias=False)
+        self._bn1 = nn.BatchNorm2d(
+            num_features=out_channels, momentum=bn_mom, eps=bn_eps)
+
+        self._avg_pooling = nn.AdaptiveAvgPool2d(1)
+        if self._global_params.include_top:
+            self._dropout = nn.Dropout(self._global_params.dropout_rate)
+            self._fc = nn.Linear(out_channels, self._global_params.num_classes)
+
+        self._swish = MemoryEfficientSwish()
+
+    def set_swish(self, memory_efficient=True):
+        """Sets swish function as memory efficient (for training) or standard (for export).
+        Args:
+            memory_efficient (bool): Whether to use memory-efficient version of swish.
+        """
+        self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
+        for block in self._blocks:
+            block.set_swish(memory_efficient)
+
+    def extract_endpoints(self, inputs):
+        """Use convolution layer to extract features at each
+        spatial reduction level, plus the head convolution output.
+        Args:
+            inputs (tensor): Input tensor.
+        Returns:
+            Dictionary keyed 'reduction_1', 'reduction_2', ...; the
+            last entry is the output of the head convolution.
+            Example:
+                >>> import torch
+                >>> from efficientnet.model import EfficientNet
+                >>> inputs = torch.rand(1, 3, 224, 224)
+                >>> model = EfficientNet.from_pretrained('efficientnet-b0')
+                >>> endpoints = model.extract_endpoints(inputs)
+                >>> print(endpoints['reduction_1'].shape)  # torch.Size([1, 16, 112, 112])
+                >>> print(endpoints['reduction_2'].shape)  # torch.Size([1, 24, 56, 56])
+                >>> print(endpoints['reduction_3'].shape)  # torch.Size([1, 40, 28, 28])
+                >>> print(endpoints['reduction_4'].shape)  # torch.Size([1, 112, 14, 14])
+                >>> print(endpoints['reduction_5'].shape)  # torch.Size([1, 320, 7, 7])
+                >>> print(endpoints['reduction_6'].shape)  # torch.Size([1, 1280, 7, 7])
+        """
+        endpoints = dict()
+
+        x = self._swish(self._bn0(self._conv_stem(inputs)))
+        prev_x = x
+
+        for idx, block in enumerate(self._blocks):
+            drop_connect_rate = self._global_params.drop_connect_rate
+            if drop_connect_rate:
+                drop_connect_rate *= float(idx) / len(
+                    self._blocks)  # scale drop connect_rate
+            x = block(x, drop_connect_rate=drop_connect_rate)
+            if prev_x.size(2) > x.size(2):
+                endpoints['reduction_{}'.format(len(endpoints) + 1)] = prev_x
+            elif idx == len(self._blocks) - 1:
+                endpoints['reduction_{}'.format(len(endpoints) + 1)] = x
+            prev_x = x
+
+        x = self._swish(self._bn1(self._conv_head(x)))
+        endpoints['reduction_{}'.format(len(endpoints) + 1)] = x
+
+        return endpoints
+
+    def extract_features(self, inputs):
+        """use convolution layer to extract feature .
+        Args:
+            inputs (tensor): Input tensor.
+        Returns:
+            Output of the final convolution
+            layer in the efficientnet model.
+        """
+        x = self._swish(self._bn0(self._conv_stem(inputs)))
+
+        for idx, block in enumerate(self._blocks):
+            drop_connect_rate = self._global_params.drop_connect_rate
+            if drop_connect_rate:
+                drop_connect_rate *= float(idx) / len(self._blocks)
+            x = block(x, drop_connect_rate=drop_connect_rate)
+        x = self._swish(self._bn1(self._conv_head(x)))
+
+        return x
+
+    def forward(self, inputs):
+        """EfficientNet's forward function.
+           Calls extract_features to extract features, applies final linear layer, and returns logits.
+        Args:
+            inputs (tensor): Input tensor.
+        Returns:
+            Output of this model after processing.
+        """
+        x = self.extract_features(inputs)
+        x = self._avg_pooling(x)
+        if self._global_params.include_top:
+            x = x.flatten(start_dim=1)
+            x = self._dropout(x)
+            x = self._fc(x)
+        return x
+
+    @classmethod
+    def from_name(cls, model_name, in_channels=3, **override_params):
+        """Create an efficientnet model according to name.
+        Args:
+            model_name (str): Name for efficientnet.
+            in_channels (int): Input data's channel number.
+            override_params (other key word params):
+                Params to override model's global_params.
+                Optional key:
+                    'width_coefficient', 'depth_coefficient',
+                    'image_size', 'dropout_rate',
+                    'num_classes', 'batch_norm_momentum',
+                    'batch_norm_epsilon', 'drop_connect_rate',
+                    'depth_divisor', 'min_depth'
+        Returns:
+            An efficientnet model.
+        """
+        cls._check_model_name_is_valid(model_name)
+        blocks_args, global_params = get_model_params(model_name,
+                                                      override_params)
+        model = cls(blocks_args, global_params)
+        model._change_in_channels(in_channels)
+        return model
+
+    @classmethod
+    def from_pretrained(cls,
+                        model_name,
+                        weights_path=None,
+                        advprop=False,
+                        in_channels=3,
+                        num_classes=1000,
+                        **override_params):
+        """Create an efficientnet model according to name.
+        Args:
+            model_name (str): Name for efficientnet.
+            weights_path (None or str):
+                Kept in the signature but not read by this method;
+                no weights are loaded from disk or the Internet here.
+            advprop (bool):
+                Kept in the signature but not read by this method
+                (no advprop weights are loaded).
+            in_channels (int): Input data's channel number.
+            num_classes (int):
+                Number of categories for classification.
+                It controls the output size for final linear layer.
+            override_params (other key word params):
+                Params to override model's global_params.
+                Optional key:
+                    'width_coefficient', 'depth_coefficient',
+                    'image_size', 'dropout_rate',
+                    'batch_norm_momentum',
+                    'batch_norm_epsilon', 'drop_connect_rate',
+                    'depth_divisor', 'min_depth'
+        Returns:
+            An efficientnet model built by from_name; weights are not loaded.
+        """
+        model = cls.from_name(
+            model_name, num_classes=num_classes, **override_params)
+        model._change_in_channels(in_channels)
+        return model
+
+    @classmethod
+    def get_image_size(cls, model_name):
+        """Get the input image size for a given efficientnet model.
+        Args:
+            model_name (str): Name for efficientnet.
+        Returns:
+            Input image size (resolution).
+        """
+        cls._check_model_name_is_valid(model_name)
+        _, _, res, _ = efficientnet_params(model_name)
+        return res
+
+    @classmethod
+    def _check_model_name_is_valid(cls, model_name):
+        """Validates model name.
+        Args:
+            model_name (str): Name for efficientnet.
+        Raises:
+            ValueError: If model_name is not one of VALID_MODELS.
+        """
+        if model_name not in VALID_MODELS:
+            raise ValueError('model_name should be one of: '
+                             + ', '.join(VALID_MODELS))
+
+    def _change_in_channels(self, in_channels):
+        """Adjust model's first convolution layer to in_channels, if in_channels not equals 3.
+        Args:
+            in_channels (int): Input data's channel number.
+        """
+        if in_channels != 3:
+            Conv2d = get_same_padding_conv2d(
+                image_size=self._global_params.image_size)
+            out_channels = round_filters(32, self._global_params)
+            self._conv_stem = Conv2d(
+                in_channels, out_channels, kernel_size=3, stride=2, bias=False)
diff --git a/modelscope/models/cv/face_emotion/efficient/utils.py b/modelscope/models/cv/face_emotion/efficient/utils.py
new file mode 100644
index 00000000..6cae70fc
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/efficient/utils.py
@@ -0,0 +1,559 @@
+# The implementation here is modified based on EfficientNet,
+# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch
+
+import collections
+import math
+import re
+from functools import partial
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.utils import model_zoo
+
+GlobalParams = collections.namedtuple('GlobalParams', [
+    'width_coefficient', 'depth_coefficient', 'image_size', 'dropout_rate',
+    'num_classes', 'batch_norm_momentum', 'batch_norm_epsilon',
+    'drop_connect_rate', 'depth_divisor', 'min_depth', 'include_top'
+])
+
+BlockArgs = collections.namedtuple('BlockArgs', [
+    'num_repeat', 'kernel_size', 'stride', 'expand_ratio', 'input_filters',
+    'output_filters', 'se_ratio', 'id_skip'
+])
+
+GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields)
+BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields)
+
+if hasattr(nn, 'SiLU'):
+    Swish = nn.SiLU
+else:
+
+    class Swish(nn.Module):
+
+        def forward(self, x):
+            return x * torch.sigmoid(x)
+
+
+class SwishImplementation(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, i):
+        result = i * torch.sigmoid(i)
+        ctx.save_for_backward(i)
+        return result
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        i = ctx.saved_tensors[0]
+        sigmoid_i = torch.sigmoid(i)
+        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
+
+
+class MemoryEfficientSwish(nn.Module):
+
+    def forward(self, x):
+        return SwishImplementation.apply(x)
+
+
+def round_filters(filters, global_params):
+    """Calculate and round number of filters based on width multiplier.
+       Use width_coefficient, depth_divisor and min_depth of global_params.
+    Args:
+        filters (int): Filters number to be calculated.
+        global_params (namedtuple): Global params of the model.
+    Returns:
+        new_filters: New filters number after calculating.
+    """
+    multiplier = global_params.width_coefficient
+    if not multiplier:
+        return filters
+
+    divisor = global_params.depth_divisor
+    min_depth = global_params.min_depth
+    filters *= multiplier
+    min_depth = min_depth or divisor
+    new_filters = max(min_depth,
+                      int(filters + divisor / 2) // divisor * divisor)
+    if new_filters < 0.9 * filters:
+        new_filters += divisor
+    return int(new_filters)
+
+
+def round_repeats(repeats, global_params):
+    """Calculate module's repeat number of a block based on depth multiplier.
+       Use depth_coefficient of global_params.
+    Args:
+        repeats (int): num_repeat to be calculated.
+        global_params (namedtuple): Global params of the model.
+    Returns:
+        new repeat: New repeat number after calculating.
+    """
+    multiplier = global_params.depth_coefficient
+    if not multiplier:
+        return repeats
+    return int(math.ceil(multiplier * repeats))
+
+
+def drop_connect(inputs, p, training):
+    """Drop connect.
+    Args:
+        inputs (tensor: BCWH): Input of this structure.
+        p (float: 0.0~1.0): Probability of drop connection.
+        training (bool): The running mode.
+    Returns:
+        output: Output after drop connection (inputs as-is if not training).
+    """
+    assert 0 <= p <= 1, 'p must be in range of [0,1]'
+
+    if not training:
+        return inputs
+
+    batch_size = inputs.shape[0]
+    keep_prob = 1 - p
+
+    random_tensor = keep_prob
+    random_tensor += torch.rand([batch_size, 1, 1, 1],
+                                dtype=inputs.dtype,
+                                device=inputs.device)
+    binary_tensor = torch.floor(random_tensor)  # one 0/1 mask per sample
+
+    output = inputs / keep_prob * binary_tensor
+    return output
+
+
+def get_width_and_height_from_size(x):
+    """Obtain height and width from x.
+    Args:
+        x (int, tuple or list): Data size.
+    Returns:
+        size: A tuple or list (H,W).
+    """
+    if isinstance(x, int):
+        return x, x
+    if isinstance(x, list) or isinstance(x, tuple):
+        return x
+    else:
+        raise TypeError()
+
+
+def calculate_output_image_size(input_image_size, stride):
+    """Calculates the output image size when using Conv2dSamePadding with a stride.
+       Necessary for static padding. Thanks to mannatsingh for pointing this out.
+    Args:
+        input_image_size (int, tuple or list): Size of input image.
+        stride (int, tuple or list): Conv2d operation's stride.
+    Returns:
+        output_image_size: A list [H,W].
+    """
+    if input_image_size is None:
+        return None
+    image_height, image_width = get_width_and_height_from_size(
+        input_image_size)
+    stride = stride if isinstance(stride, int) else stride[0]
+    image_height = int(math.ceil(image_height / stride))
+    image_width = int(math.ceil(image_width / stride))
+    return [image_height, image_width]
+
+
+def get_same_padding_conv2d(image_size=None):
+    """Chooses static padding if you have specified an image size, and dynamic padding otherwise.
+       Static padding is necessary for ONNX exporting of models.
+    Args:
+        image_size (int or tuple): Size of the image.
+    Returns:
+        Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
+    """
+    if image_size is None:
+        return Conv2dDynamicSamePadding
+    else:
+        return partial(Conv2dStaticSamePadding, image_size=image_size)
+
+
+class Conv2dDynamicSamePadding(nn.Conv2d):
+    """2D Convolutions like TensorFlow, for a dynamic image size.
+       The padding is operated in forward function by calculating dynamically.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 bias=True):
+        super().__init__(in_channels, out_channels, kernel_size, stride, 0,
+                         dilation, groups, bias)
+        self.stride = self.stride if len(
+            self.stride) == 2 else [self.stride[0]] * 2
+
+    def forward(self, x):
+        ih, iw = x.size()[-2:]
+        kh, kw = self.weight.size()[-2:]
+        sh, sw = self.stride
+        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
+        a1 = (oh - 1) * self.stride[0]
+        pad_h = max(a1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
+        a2 = (ow - 1) * self.stride[1]
+        pad_w = max(a2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
+        if pad_h > 0 or pad_w > 0:
+            x = F.pad(x, [
+                pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
+            ])
+        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
+                        self.dilation, self.groups)
+
+
+class Conv2dStaticSamePadding(nn.Conv2d):
+    """2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
+       The padding module is calculated in construction function, then used in forward.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 image_size=None,
+                 **kwargs):
+        super().__init__(in_channels, out_channels, kernel_size, stride,
+                         **kwargs)
+        self.stride = self.stride if len(
+            self.stride) == 2 else [self.stride[0]] * 2
+
+        assert image_size is not None
+        ih, iw = (image_size,
+                  image_size) if isinstance(image_size, int) else image_size
+        kh, kw = self.weight.size()[-2:]
+        sh, sw = self.stride
+        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
+        b1 = (oh - 1) * self.stride[0]
+        pad_h = max(b1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
+        b2 = (ow - 1) * self.stride[1]
+        pad_w = max(b2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
+        if pad_h > 0 or pad_w > 0:
+            self.static_padding = nn.ZeroPad2d(
+                (pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
+                 pad_h - pad_h // 2))
+        else:
+            self.static_padding = nn.Identity()
+
+    def forward(self, x):
+        x = self.static_padding(x)
+        x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
+                     self.dilation, self.groups)
+        return x
+
+
+def get_same_padding_maxPool2d(image_size=None):
+    """Chooses static padding if you have specified an image size, and dynamic padding otherwise.
+       Static padding is necessary for ONNX exporting of models.
+    Args:
+        image_size (int or tuple): Size of the image.
+    Returns:
+        MaxPool2dDynamicSamePadding or MaxPool2dStaticSamePadding.
+    """
+    if image_size is None:
+        return MaxPool2dDynamicSamePadding
+    else:
+        return partial(MaxPool2dStaticSamePadding, image_size=image_size)
+
+
+class MaxPool2dDynamicSamePadding(nn.MaxPool2d):
+    """2D MaxPooling like TensorFlow's 'SAME' mode, with a dynamic image size.
+       The padding is operated in forward function by calculating dynamically.
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride,
+                 padding=0,
+                 dilation=1,
+                 return_indices=False,
+                 ceil_mode=False):
+        super().__init__(kernel_size, stride, padding, dilation,
+                         return_indices, ceil_mode)
+        self.stride = [self.stride] * 2 if isinstance(self.stride,
+                                                      int) else self.stride
+        self.kernel_size = [self.kernel_size] * 2 if isinstance(
+            self.kernel_size, int) else self.kernel_size
+        self.dilation = [self.dilation] * 2 if isinstance(
+            self.dilation, int) else self.dilation
+
+    def forward(self, x):
+        ih, iw = x.size()[-2:]
+        kh, kw = self.kernel_size
+        sh, sw = self.stride
+        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
+        c1 = (oh - 1) * self.stride[0]
+        pad_h = max(c1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
+        c2 = (ow - 1) * self.stride[1]
+        pad_w = max(c2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
+        if pad_h > 0 or pad_w > 0:
+            x = F.pad(x, [
+                pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
+            ])
+        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
+                            self.dilation, self.ceil_mode, self.return_indices)
+
+
+class MaxPool2dStaticSamePadding(nn.MaxPool2d):
+    """2D MaxPooling like TensorFlow's 'SAME' mode, with the given input image size.
+       The padding module is calculated in construction function, then used in forward.
+    """
+
+    def __init__(self, kernel_size, stride, image_size=None, **kwargs):
+        super().__init__(kernel_size, stride, **kwargs)
+        self.stride = [self.stride] * 2 if isinstance(self.stride,
+                                                      int) else self.stride
+        self.kernel_size = [self.kernel_size] * 2 if isinstance(
+            self.kernel_size, int) else self.kernel_size
+        self.dilation = [self.dilation] * 2 if isinstance(
+            self.dilation, int) else self.dilation
+
+        assert image_size is not None
+        ih, iw = (image_size,
+                  image_size) if isinstance(image_size, int) else image_size
+        kh, kw = self.kernel_size
+        sh, sw = self.stride
+        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
+        d1 = (oh - 1) * self.stride[0]
+        pad_h = max(d1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
+        d2 = (ow - 1) * self.stride[1]
+        pad_w = max(d2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
+        if pad_h > 0 or pad_w > 0:
+            self.static_padding = nn.ZeroPad2d(
+                (pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
+                 pad_h - pad_h // 2))
+        else:
+            self.static_padding = nn.Identity()
+
+    def forward(self, x):
+        x = self.static_padding(x)
+        x = F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
+                         self.dilation, self.ceil_mode, self.return_indices)
+        return x
+
+
+class BlockDecoder(object):
+    """Block Decoder for readability,
+       straight from the official TensorFlow repository.
+    """
+
+    @staticmethod
+    def _decode_block_string(block_string):
+        """Get a block through a string notation of arguments.
+        Args:
+            block_string (str): A string notation of arguments.
+                                Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
+        Returns:
+            BlockArgs: The namedtuple defined at the top of this file.
+        """
+        assert isinstance(block_string, str)
+
+        ops = block_string.split('_')
+        options = {}
+        for op in ops:
+            splits = re.split(r'(\d.*)', op)
+            if len(splits) >= 2:
+                key, value = splits[:2]
+                options[key] = value
+
+        # Check stride
+        assert (('s' in options and len(options['s']) == 1)
+                or (len(options['s']) == 2
+                    and options['s'][0] == options['s'][1]))
+
+        return BlockArgs(
+            num_repeat=int(options['r']),
+            kernel_size=int(options['k']),
+            stride=[int(options['s'][0])],
+            expand_ratio=int(options['e']),
+            input_filters=int(options['i']),
+            output_filters=int(options['o']),
+            se_ratio=float(options['se']) if 'se' in options else None,
+            id_skip=('noskip' not in block_string))
+
+    @staticmethod
+    def _encode_block_string(block):
+        """Encode a block to a string (inverse of _decode_block_string).
+        Args:
+            block (namedtuple): A BlockArgs; stride may be an int or a list.
+        Returns:
+            block_string: A String form, e.g. 'r1_k3_s11_e1_i32_o16_se0.25'.
+        """
+        s = block.stride if isinstance(block.stride, int) else block.stride[0]
+        args = [
+            'r%d' % block.num_repeat, 'k%d' % block.kernel_size,
+            's%d%d' % (s, s),  # the decoder only accepts equal h/w strides
+            'e%s' % block.expand_ratio,
+            'i%d' % block.input_filters,
+            'o%d' % block.output_filters
+        ]
+        if block.se_ratio is not None and 0 < block.se_ratio <= 1:
+            args.append('se%s' % block.se_ratio)
+        if block.id_skip is False:
+            args.append('noskip')
+        return '_'.join(args)
+
+    @staticmethod
+    def decode(string_list):
+        """Decode a list of string notations to specify blocks inside the network.
+        Args:
+            string_list (list[str]): A list of strings, each string is a notation of block.
+        Returns:
+            blocks_args: A list of BlockArgs namedtuples of block args.
+        """
+        assert isinstance(string_list, list)
+        blocks_args = []
+        for block_string in string_list:
+            blocks_args.append(BlockDecoder._decode_block_string(block_string))
+        return blocks_args
+
+    @staticmethod
+    def encode(blocks_args):
+        """Encode a list of BlockArgs to a list of strings.
+        Args:
+            blocks_args (list[namedtuples]): A list of BlockArgs namedtuples of block args.
+        Returns:
+            block_strings: A list of strings, each string is a notation of block.
+        """
+        block_strings = []
+        for block in blocks_args:
+            block_strings.append(BlockDecoder._encode_block_string(block))
+        return block_strings
+
+
+def efficientnet_params(model_name):
+    """Map EfficientNet model name to parameter coefficients.
+    Args:
+        model_name (str): Model name to be queried.
+    Returns:
+        params_dict[model_name]: A (width,depth,res,dropout) tuple.
+    """
+    params_dict = {
+        'efficientnet-b0': (1.0, 1.0, 112, 0.2),
+        'efficientnet-b1': (1.0, 1.1, 240, 0.2),
+        'efficientnet-b2': (1.1, 1.2, 260, 0.3),
+        'efficientnet-b3': (1.2, 1.4, 300, 0.3),
+        'efficientnet-b4': (1.4, 1.8, 380, 0.4),
+        'efficientnet-b5': (1.6, 2.2, 456, 0.4),
+        'efficientnet-b6': (1.8, 2.6, 528, 0.5),
+        'efficientnet-b7': (2.0, 3.1, 600, 0.5),
+        'efficientnet-b8': (2.2, 3.6, 672, 0.5),
+        'efficientnet-l2': (4.3, 5.3, 800, 0.5),
+    }
+    return params_dict[model_name]
+
+
+def efficientnet(width_coefficient=None,
+                 depth_coefficient=None,
+                 image_size=None,
+                 dropout_rate=0.2,
+                 drop_connect_rate=0.2,
+                 num_classes=1000,
+                 include_top=True):
+    """Create BlockArgs and GlobalParams for efficientnet model.
+    Args:
+        width_coefficient (float)
+        depth_coefficient (float)
+        image_size (int)
+        dropout_rate (float)
+        drop_connect_rate (float)
+        num_classes (int)
+        Meaning as the name suggests.
+    Returns:
+        blocks_args, global_params.
+    """
+
+    blocks_args = [
+        'r1_k3_s11_e1_i32_o16_se0.25',
+        'r2_k3_s22_e6_i16_o24_se0.25',
+        'r2_k5_s22_e6_i24_o40_se0.25',
+        'r3_k3_s22_e6_i40_o80_se0.25',
+        'r3_k5_s11_e6_i80_o112_se0.25',
+        'r4_k5_s22_e6_i112_o192_se0.25',
+        'r1_k3_s11_e6_i192_o320_se0.25',
+    ]
+    blocks_args = BlockDecoder.decode(blocks_args)
+
+    global_params = GlobalParams(
+        width_coefficient=width_coefficient,
+        depth_coefficient=depth_coefficient,
+        image_size=image_size,
+        dropout_rate=dropout_rate,
+        num_classes=num_classes,
+        batch_norm_momentum=0.99,
+        batch_norm_epsilon=1e-3,
+        drop_connect_rate=drop_connect_rate,
+        depth_divisor=8,
+        min_depth=None,
+        include_top=include_top,
+    )
+    return blocks_args, global_params
+
+
+def get_model_params(model_name, override_params):
+    """Get the block args and global params for a given model name.
+    Args:
+        model_name (str): Model's name.
+        override_params (dict): A dict to modify global_params.
+    Returns:
+        blocks_args, global_params
+    """
+    if model_name.startswith('efficientnet'):
+        w, d, s, p = efficientnet_params(model_name)
+        blocks_args, global_params = efficientnet(
+            width_coefficient=w,
+            depth_coefficient=d,
+            dropout_rate=p,
+            image_size=s)
+    else:
+        raise NotImplementedError(
+            'model name is not pre-defined: {}'.format(model_name))
+    if override_params:
+        global_params = global_params._replace(**override_params)
+    return blocks_args, global_params
+
+
+def load_pretrained_weights(model,
+                            model_name,
+                            weights_path=None,
+                            load_fc=True,
+                            advprop=False,
+                            verbose=True):
+    """Loads pretrained weights from weights path or download using url.
+    Args:
+        model (Module): The whole model of efficientnet.
+        model_name (str): Model name of efficientnet.
+        weights_path (None or str): Local weights file path; if not a str, weights are downloaded by url.
+        load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model.
+        advprop (bool): Whether to load weights trained with advprop (valid when weights_path is None).
+        verbose (bool): Whether to print a message once the weights are loaded.
+    Raises:
+        AssertionError: If keys are missing from, or unexpected in, the loaded state dict.
+    """
+    if isinstance(weights_path, str):
+        state_dict = torch.load(weights_path)
+    else:
+        url_map_ = url_map_advprop if advprop else url_map
+        state_dict = model_zoo.load_url(url_map_[model_name])
+
+    if load_fc:
+        ret = model.load_state_dict(state_dict, strict=False)
+        assert not ret.missing_keys, 'Missing keys when loading pretrained weights: {}'.format(
+            ret.missing_keys)
+    else:
+        state_dict.pop('_fc.weight')  # drop the classifier so only the remaining weights are restored
+        state_dict.pop('_fc.bias')
+        ret = model.load_state_dict(state_dict, strict=False)
+        assert set(ret.missing_keys) == set([
+            '_fc.weight', '_fc.bias'
+        ]), 'Missing keys when loading pretrained weights: {}'.format(
+            ret.missing_keys)
+    assert not ret.unexpected_keys, 'Unexpected keys when loading pretrained weights: {}'.format(
+        ret.unexpected_keys)
+
+    if verbose:
+        print('Loaded pretrained weights for {}'.format(model_name))
diff --git a/modelscope/models/cv/face_emotion/emotion_infer.py b/modelscope/models/cv/face_emotion/emotion_infer.py
new file mode 100644
index 00000000..e3398592
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/emotion_infer.py
@@ -0,0 +1,67 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import torch
+from PIL import Image
+from torch import nn
+from torchvision import transforms
+
+from modelscope.utils.logger import get_logger
+from .face_alignment.face_align import face_detection_PIL_v2
+
+logger = get_logger()
+
+
+def transform_PIL(img_pil):
+    val_transforms = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(
+            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    ])
+    return val_transforms(img_pil)
+
+
+index2AU = [1, 2, 4, 6, 7, 10, 12, 15, 23, 24, 25, 26]
+emotion_list = [
+    'Neutral', 'Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise'
+]
+
+
+def inference(image_path, model, face_model, score_thre=0.5, GPU=0):
+    """Predict the emotion of the face detected in the image at `image_path`.
+
+    Args:
+        image_path (str): Image file to open with PIL (converted to RGB).
+        model: Callable returning (AU logits, emotion logits) for a face batch.
+        face_model: Forwarded unchanged to face_detection_PIL_v2.
+        score_thre (float): Probability the top emotion must exceed to be reported.
+        GPU (int): CUDA device index the face tensor is moved to when CUDA is available.
+    Returns:
+        tuple: (emotion name, bbox); (None, None) when no face is detected.
+    """
+    image = Image.open(image_path).convert('RGB')
+
+    face, bbox = face_detection_PIL_v2(image, face_model)
+    if bbox is None:
+        logger.warning('no face detected!')
+        # Keep the two-element tuple shape of the success path so that
+        # callers doing `emotion, bbox = inference(...)` still work.
+        return None, None
+
+    face = transform_PIL(face)
+    face = face.unsqueeze(0)
+    if torch.cuda.is_available():
+        face = face.cuda(GPU)
+    with torch.no_grad():  # inference only, no autograd graph needed
+        # The AU logits were only post-processed into values that were never returned.
+        _, logits_emotion = model(face)
+    probs = nn.functional.softmax(logits_emotion, 1)
+
+    _, index_list = probs.max(1)
+    emotion_index = index_list[0].item()
+    prob = probs[0][emotion_index]
+    # Class 3 ('Fear') and low-confidence predictions are reported as 'Neutral'.
+    if prob > score_thre and emotion_index != 3:
+        cur_emotion = emotion_list[emotion_index]
+    else:
+        cur_emotion = 'Neutral'
+
+    return cur_emotion, bbox
diff --git a/modelscope/models/cv/face_emotion/emotion_model.py b/modelscope/models/cv/face_emotion/emotion_model.py
new file mode 100644
index 00000000..f8df9c37
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/emotion_model.py
@@ -0,0 +1,96 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import os
+import sys
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.face_emotion.efficient import EfficientNet
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@MODELS.register_module(Tasks.face_emotion, module_name=Models.face_emotion)
+class EfficientNetForFaceEmotion(TorchModel):
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+        self.model = FaceEmotionModel(
+            name='efficientnet-b0', num_embed=512, num_au=12, num_emotion=7)
+
+        if torch.cuda.is_available():
+            self.device = 'cuda'
+            logger.info('Use GPU')
+        else:
+            self.device = 'cpu'
+            logger.info('Use CPU')
+        pretrained_params = torch.load(
+            '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location=self.device)
+
+        state_dict = pretrained_params['model']
+        new_state = {}
+        for k, v in state_dict.items():
+            if k.startswith('module.'):
+                k = k[7:]
+            new_state[k] = v
+
+        self.model.load_state_dict(new_state)
+        self.model.eval()
+        self.model.to(self.device)
+
+    def forward(self, x):
+        logits_au, logits_emotion = self.model(x)
+        return logits_au, logits_emotion
+
+
+class FaceEmotionModel(nn.Module):
+
+    def __init__(self,
+                 name='efficientnet-b0',
+                 num_embed=512,
+                 num_au=12,
+                 num_emotion=7):
+        super(FaceEmotionModel, self).__init__()
+        self.backbone = EfficientNet.from_pretrained(
+            name, weights_path=None, advprop=True)
+        self.average_pool = nn.AdaptiveAvgPool2d(1)
+        self.embed = nn.Linear(self.backbone._fc.weight.data.shape[1],
+                               num_embed)
+        self.features = nn.BatchNorm1d(num_embed)
+        nn.init.constant_(self.features.weight, 1.0)
+        self.features.weight.requires_grad = False
+        self.fc_au = nn.Sequential(
+            nn.Dropout(0.6),
+            nn.Linear(num_embed, num_au),
+        )
+        self.fc_emotion = nn.Sequential(
+            nn.Dropout(0.6),
+            nn.Linear(num_embed, num_emotion),
+        )
+
+    def feat_single_img(self, x):
+        x = self.backbone.extract_features(x)
+        x = self.average_pool(x)
+        x = x.flatten(1)
+        x = self.embed(x)
+        x = self.features(x)
+        return x
+
+    def forward(self, x):
+        x = self.feat_single_img(x)
+        logits_au = self.fc_au(x)
+        att_au = torch.sigmoid(logits_au).unsqueeze(-1)
+        x = x.unsqueeze(1)
+        emotion_vec_list = torch.matmul(att_au, x)
+        emotion_vec = emotion_vec_list.sum(1)
+        logits_emotion = self.fc_emotion(emotion_vec)
+        return logits_au, logits_emotion
diff --git a/modelscope/models/cv/face_emotion/face_alignment/__init__.py b/modelscope/models/cv/face_emotion/face_alignment/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_emotion/face_alignment/face.py b/modelscope/models/cv/face_emotion/face_alignment/face.py
new file mode 100644
index 00000000..a362bddc
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/face_alignment/face.py
@@ -0,0 +1,79 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import os
+
+import cv2
+import numpy as np
+import tensorflow as tf
+
+
+def init(mod):
+    PATH_TO_CKPT = mod
+    net = tf.Graph()
+    with net.as_default():
+        od_graph_def = tf.GraphDef()
+        config = tf.ConfigProto()
+        config.gpu_options.per_process_gpu_memory_fraction = 0.6
+        with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
+            serialized_graph = fid.read()
+            od_graph_def.ParseFromString(serialized_graph)
+            tf.import_graph_def(od_graph_def, name='')
+            sess = tf.Session(graph=net, config=config)
+    return sess, net
+
+
+def filter_bboxes_confs(shape,
+                        imgsBboxes,
+                        imgsConfs,
+                        single=False,
+                        thresh=0.5):
+    [w, h] = shape
+    if single:
+        bboxes, confs = [], []
+        for y in range(len(imgsBboxes)):
+            if imgsConfs[y] >= thresh:
+                [x1, y1, x2, y2] = list(imgsBboxes[y])
+                x1, y1, x2, y2 = int(w * x1), int(h * y1), int(w * x2), int(
+                    h * y2)
+                bboxes.append([y1, x1, y2, x2])
+                confs.append(imgsConfs[y])
+        return bboxes, confs
+    else:
+        retImgsBboxes, retImgsConfs = [], []
+        for x in range(len(imgsBboxes)):
+            bboxes, confs = [], []
+            for y in range(len(imgsBboxes[x])):
+                if imgsConfs[x][y] >= thresh:
+                    [x1, y1, x2, y2] = list(imgsBboxes[x][y])
+                    x1, y1, x2, y2 = int(w * x1), int(h * y1), int(
+                        w * x2), int(h * y2)
+                    bboxes.append([y1, x1, y2, x2])
+                    confs.append(imgsConfs[x][y])
+            retImgsBboxes.append(bboxes)
+            retImgsConfs.append(confs)
+        return retImgsBboxes, retImgsConfs
+
+
+def detect(im, sess, net):  # run the frozen detection graph on one image; returns (pixel boxes, confidences)
+    image_np = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)  # NOTE(review): face_align.py passes RGB, so this yields BGR - confirm
+    image_np_expanded = np.expand_dims(image_np, axis=0)  # add a leading batch axis
+    image_tensor = net.get_tensor_by_name('image_tensor:0')
+    bboxes = net.get_tensor_by_name('detection_boxes:0')
+    dConfs = net.get_tensor_by_name('detection_scores:0')
+    classes = net.get_tensor_by_name('detection_classes:0')
+    num_detections = net.get_tensor_by_name('num_detections:0')
+    (bboxes, dConfs, classes,
+     num_detections) = sess.run([bboxes, dConfs, classes, num_detections],
+                                feed_dict={image_tensor: image_np_expanded})
+    w, h, _ = im.shape  # NOTE: `w` is im.shape[0] and `h` is im.shape[1]; face_align.py reads shape[0:2] as (h, w)
+    bboxes, confs = filter_bboxes_confs([w, h], bboxes[0], dConfs[0], True)  # first batch entry, single=True
+    return bboxes, confs
+
+
+class FaceDetector:
+
+    def __init__(self, mod):
+        self.sess, self.net = init(mod)
+
+    def do_detect(self, im):
+        bboxes, confs = detect(im, self.sess, self.net)
+        return bboxes, confs
diff --git a/modelscope/models/cv/face_emotion/face_alignment/face_align.py b/modelscope/models/cv/face_emotion/face_alignment/face_align.py
new file mode 100644
index 00000000..71282b12
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/face_alignment/face_align.py
@@ -0,0 +1,59 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import os
+import sys
+
+import cv2
+import numpy as np
+from PIL import Image, ImageFile
+
+from .face import FaceDetector
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+def adjust_bx_v2(box, w, h):  # grow box [x1, y1, x2, y2] toward a square, clamped to a w x h image
+    x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+    box_w = x2 - x1
+    box_h = y2 - y1
+    delta = abs(box_w - box_h)  # amount the shorter side has to grow
+    if box_w > box_h:  # wider than tall: grow vertically
+        if y1 >= delta:  # enough room above: move the top edge up by the full delta
+            y1 = y1 - delta
+        else:  # clamp the top to 0 and push the remainder onto the bottom edge
+            delta_y1 = y1
+            y1 = 0
+            delta_y2 = delta - delta_y1
+            y2 = y2 + delta_y2 if y2 < h - delta_y2 else h - 1
+    else:  # taller than wide (or already square): grow horizontally
+        if x1 >= delta / 2 and x2 <= w - delta / 2:  # room on both sides: split delta evenly
+            x1 = x1 - delta / 2
+            x2 = x2 + delta / 2
+        elif x1 < delta / 2 and x2 <= w - delta / 2:  # left side short: clamp left, extend right
+            delta_x1 = x1
+            x1 = 0
+            delta_x2 = delta - delta_x1
+            x2 = x2 + delta_x2 if x2 < w - delta_x2 else w - 1
+        elif x1 >= delta / 2 and x2 > w - delta / 2:  # right side short: clamp right, extend left
+            delta_x2 = w - x2
+            x2 = w - 1
+            delta_x1 = delta - x1  # NOTE(review): mirrored branch uses `delta - delta_x1`; `delta - delta_x2` may be intended
+            x1 = x1 - delta_x1 if x1 >= delta_x1 else 0
+    # NOTE: if neither side has delta / 2 of room, no branch above matches and x1/x2 are left as-is.
+    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)  # truncate to integer pixel coordinates
+    return [x1, y1, x2, y2]
+
+
+def face_detection_PIL_v2(image, face_model):  # crop the first detected face; returns (PIL crop, box tuple)
+    crop_size = 112  # side length of the square crop handed back to the caller
+    face_detector = FaceDetector(face_model)  # NOTE(review): a new detector (TF graph + session, see face.py) per call
+    img = np.array(image)
+    h, w = img.shape[0:2]
+    bxs, conf = face_detector.do_detect(img)
+    bx = bxs[0]  # NOTE(review): IndexError when no box passes the threshold; a None box is never returned
+    bx = adjust_bx_v2(bx, w, h)
+    x1, y1, x2, y2 = bx
+    image = img[y1:y2, x1:x2, :]  # crop rows y1:y2 and columns x1:x2
+    img = Image.fromarray(image)
+    img = img.resize((crop_size, crop_size))
+    bx = tuple(bx)
+    return img, bx
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index f13bbed9..9811811e 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -664,11 +664,15 @@ TASK_OUTPUTS = {
     # }
     Tasks.hand_static: [OutputKeys.OUTPUT],
 
-    # {
     #     'output': [
     #                [2, 75, 287, 240, 510, 0.8335018754005432],
     #                [1, 127, 83, 332, 366, 0.9175254702568054],
     #                [0, 0, 0, 367, 639, 0.9693422317504883]]
     # }
     Tasks.face_human_hand_detection: [OutputKeys.OUTPUT],
+
+    # {
+    #   {'output': 'Happiness', 'boxes': (203, 104, 663, 564)}
+    # }
+    Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES]
 }
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index a14b07a6..ff56658f 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -186,6 +186,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.face_human_hand_detection:
     (Pipelines.face_human_hand_detection,
      'damo/cv_nanodet_face-human-hand-detection'),
+    Tasks.face_emotion: (Pipelines.face_emotion, 'damo/cv_face-emotion'),
 }
 
 
diff --git a/modelscope/pipelines/cv/face_emotion_pipeline.py b/modelscope/pipelines/cv/face_emotion_pipeline.py
new file mode 100644
index 00000000..249493b6
--- /dev/null
+++ b/modelscope/pipelines/cv/face_emotion_pipeline.py
@@ -0,0 +1,39 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_emotion import emotion_infer
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_emotion, module_name=Pipelines.face_emotion)
+class FaceEmotionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create face emotion pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+
+        super().__init__(model=model, **kwargs)
+        self.face_model = model + '/' + ModelFile.TF_GRAPH_FILE
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result, bbox = emotion_infer.inference(input['img_path'], self.model,
+                                               self.face_model)
+        return {OutputKeys.OUTPUT: result, OutputKeys.BOXES: bbox}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index ac6846e4..4aff1d05 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -44,6 +44,7 @@ class CVTasks(object):
     shop_segmentation = 'shop-segmentation'
     hand_static = 'hand-static'
     face_human_hand_detection = 'face-human-hand-detection'
+    face_emotion = 'face-emotion'
 
     # image editing
     skin_retouching = 'skin-retouching'
diff --git a/tests/pipelines/test_face_emotion.py b/tests/pipelines/test_face_emotion.py
new file mode 100644
index 00000000..907e15ee
--- /dev/null
+++ b/tests/pipelines/test_face_emotion.py
@@ -0,0 +1,32 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class FaceEmotionTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model = 'damo/cv_face-emotion'
+        self.img = {'img_path': 'data/test/images/face_emotion.jpg'}
+
+    def pipeline_inference(self, pipeline: Pipeline, input: str):
+        result = pipeline(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        face_emotion = pipeline(Tasks.face_emotion, model=self.model)
+        self.pipeline_inference(face_emotion, self.img)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        face_emotion = pipeline(Tasks.face_emotion)
+        self.pipeline_inference(face_emotion, self.img)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 60f17a49d694c4e0da62bf63bfd9945f18c085ec Mon Sep 17 00:00:00 2001
From: "biwen.lbw" <biwen.lbw@alibaba-inc.com>
Date: Sat, 1 Oct 2022 18:37:46 +0800
Subject: [PATCH 173/175] [to #42322933]test image url & add license headers   
      Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10313149

---
 .../cv/skin_retouching/detection_model/detection_module.py       | 1 +
 .../cv/skin_retouching/detection_model/detection_unet_in.py      | 1 +
 modelscope/models/cv/skin_retouching/inpainting_model/gconv.py   | 1 +
 .../cv/skin_retouching/inpainting_model/inpainting_unet.py       | 1 +
 modelscope/models/cv/skin_retouching/unet_deploy.py              | 1 +
 modelscope/models/cv/skin_retouching/utils.py                    | 1 +
 modelscope/models/cv/skin_retouching/weights_init.py             | 1 +
 modelscope/pipelines/cv/skin_retouching_pipeline.py              | 1 +
 8 files changed, 8 insertions(+)

diff --git a/modelscope/models/cv/skin_retouching/detection_model/detection_module.py b/modelscope/models/cv/skin_retouching/detection_model/detection_module.py
index f89ce37b..5db9c44c 100644
--- a/modelscope/models/cv/skin_retouching/detection_model/detection_module.py
+++ b/modelscope/models/cv/skin_retouching/detection_model/detection_module.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py b/modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py
index b48f6e5f..c0be1a52 100644
--- a/modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py
+++ b/modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/cv/skin_retouching/inpainting_model/gconv.py b/modelscope/models/cv/skin_retouching/inpainting_model/gconv.py
index e0910d2c..8b3eb2fc 100644
--- a/modelscope/models/cv/skin_retouching/inpainting_model/gconv.py
+++ b/modelscope/models/cv/skin_retouching/inpainting_model/gconv.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py b/modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py
index 09cea1fc..dd220dd6 100644
--- a/modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py
+++ b/modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/cv/skin_retouching/unet_deploy.py b/modelscope/models/cv/skin_retouching/unet_deploy.py
index cb37b04c..0ff75b85 100755
--- a/modelscope/models/cv/skin_retouching/unet_deploy.py
+++ b/modelscope/models/cv/skin_retouching/unet_deploy.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import warnings
 
 import torch
diff --git a/modelscope/models/cv/skin_retouching/utils.py b/modelscope/models/cv/skin_retouching/utils.py
index 12653f41..eb0da6b9 100644
--- a/modelscope/models/cv/skin_retouching/utils.py
+++ b/modelscope/models/cv/skin_retouching/utils.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import time
 from typing import Dict, List, Optional, Tuple, Union
 
diff --git a/modelscope/models/cv/skin_retouching/weights_init.py b/modelscope/models/cv/skin_retouching/weights_init.py
index efd24843..ae62d4a4 100644
--- a/modelscope/models/cv/skin_retouching/weights_init.py
+++ b/modelscope/models/cv/skin_retouching/weights_init.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/pipelines/cv/skin_retouching_pipeline.py b/modelscope/pipelines/cv/skin_retouching_pipeline.py
index f8c9de60..c6571bef 100644
--- a/modelscope/pipelines/cv/skin_retouching_pipeline.py
+++ b/modelscope/pipelines/cv/skin_retouching_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Any, Dict
 

From b636d1c08a9603ac043619d473dddfbdba350c26 Mon Sep 17 00:00:00 2001
From: ly261666 <ly261666@alibaba-inc.com>
Date: Sat, 1 Oct 2022 20:35:13 +0800
Subject: [PATCH 174/175] [to #42322933]add copyright on
 mogface,mtcnn,retinaface,ulfd,fer related files         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10305661

---
 modelscope/models/cv/face_detection/mogface/__init__.py          | 1 +
 modelscope/models/cv/face_detection/mtcnn/__init__.py            | 1 +
 modelscope/models/cv/face_detection/retinaface/__init__.py       | 1 +
 modelscope/models/cv/face_detection/ulfd_slim/__init__.py        | 1 +
 .../pipelines/cv/facial_expression_recognition_pipeline.py       | 1 +
 5 files changed, 5 insertions(+)

diff --git a/modelscope/models/cv/face_detection/mogface/__init__.py b/modelscope/models/cv/face_detection/mogface/__init__.py
index 8190b649..a58268d0 100644
--- a/modelscope/models/cv/face_detection/mogface/__init__.py
+++ b/modelscope/models/cv/face_detection/mogface/__init__.py
@@ -1 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .models.detectors import MogFaceDetector
diff --git a/modelscope/models/cv/face_detection/mtcnn/__init__.py b/modelscope/models/cv/face_detection/mtcnn/__init__.py
index b11c4740..9fddab9c 100644
--- a/modelscope/models/cv/face_detection/mtcnn/__init__.py
+++ b/modelscope/models/cv/face_detection/mtcnn/__init__.py
@@ -1 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .models.detector import MtcnnFaceDetector
diff --git a/modelscope/models/cv/face_detection/retinaface/__init__.py b/modelscope/models/cv/face_detection/retinaface/__init__.py
index 779aaf1c..e7b589a1 100644
--- a/modelscope/models/cv/face_detection/retinaface/__init__.py
+++ b/modelscope/models/cv/face_detection/retinaface/__init__.py
@@ -1 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .detection import RetinaFaceDetection
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/__init__.py
index 41a2226a..af1e7b42 100644
--- a/modelscope/models/cv/face_detection/ulfd_slim/__init__.py
+++ b/modelscope/models/cv/face_detection/ulfd_slim/__init__.py
@@ -1 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .detection import UlfdFaceDetector
diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
index c5577dcf..1b1f13d1 100644
--- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 

From a079ab922f4604a5587e92e742809c0793a5fc6d Mon Sep 17 00:00:00 2001
From: "tingwei.gtw" <tingwei.gtw@alibaba-inc.com>
Date: Sat, 1 Oct 2022 21:46:40 +0800
Subject: [PATCH 175/175] [to #42322933] add product-segmentation pipeline     
    Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10252583

---
 data/test/images/product_segmentation.jpg     |   3 +
 modelscope/metainfo.py                        |   2 +
 .../cv/product_segmentation/__init__.py       |  20 ++
 .../models/cv/product_segmentation/net.py     | 197 ++++++++++++++++++
 .../cv/product_segmentation/seg_infer.py      |  77 +++++++
 modelscope/outputs.py                         |   9 +-
 modelscope/pipelines/builder.py               |   2 +
 .../cv/product_segmentation_pipeline.py       |  40 ++++
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_product_segmentation.py  |  43 ++++
 10 files changed, 393 insertions(+), 1 deletion(-)
 create mode 100644 data/test/images/product_segmentation.jpg
 create mode 100644 modelscope/models/cv/product_segmentation/__init__.py
 create mode 100644 modelscope/models/cv/product_segmentation/net.py
 create mode 100644 modelscope/models/cv/product_segmentation/seg_infer.py
 create mode 100644 modelscope/pipelines/cv/product_segmentation_pipeline.py
 create mode 100644 tests/pipelines/test_product_segmentation.py

diff --git a/data/test/images/product_segmentation.jpg b/data/test/images/product_segmentation.jpg
new file mode 100644
index 00000000..c188a69e
--- /dev/null
+++ b/data/test/images/product_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a16038f7809127eb3e03cbae049592d193707e095309daca78f7d108d67fe4ec
+size 108357
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index ae8b5297..33273502 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -42,6 +42,7 @@ class Models(object):
     hand_static = 'hand-static'
     face_human_hand_detection = 'face-human-hand-detection'
     face_emotion = 'face-emotion'
+    product_segmentation = 'product-segmentation'
 
     # EasyCV models
     yolox = 'YOLOX'
@@ -185,6 +186,7 @@ class Pipelines(object):
     hand_static = 'hand-static'
     face_human_hand_detection = 'face-human-hand-detection'
     face_emotion = 'face-emotion'
+    product_segmentation = 'product-segmentation'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
diff --git a/modelscope/models/cv/product_segmentation/__init__.py b/modelscope/models/cv/product_segmentation/__init__.py
new file mode 100644
index 00000000..e87c8db1
--- /dev/null
+++ b/modelscope/models/cv/product_segmentation/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .seg_infer import F3NetProductSegmentation
+
+else:
+    _import_structure = {'seg_infer': ['F3NetProductSegmentation']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/product_segmentation/net.py b/modelscope/models/cv/product_segmentation/net.py
new file mode 100644
index 00000000..454c99d8
--- /dev/null
+++ b/modelscope/models/cv/product_segmentation/net.py
@@ -0,0 +1,197 @@
+# The implementation here is modified based on F3Net,
+# originally Apache 2.0 License and publicly available at https://github.com/weijun88/F3Net
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Bottleneck(nn.Module):
+    """1x1, 3x3, 1x1 conv residual block; output has planes * 4 channels."""
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None,
+                 dilation=1):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        # A 3x3 kernel dilated by d spans 2d + 1 pixels, so padding d keeps
+        # the main branch aligned with the shortcut for every dilation.
+        self.conv2 = nn.Conv2d(
+            planes,
+            planes,
+            kernel_size=3,
+            stride=stride,
+            padding=dilation,
+            bias=False,
+            dilation=dilation)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+        self.downsample = downsample
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)), inplace=True)
+        out = F.relu(self.bn2(self.conv2(out)), inplace=True)
+        out = self.bn3(self.conv3(out))
+        # Project the shortcut when a downsample module was supplied.
+        if self.downsample is not None:
+            x = self.downsample(x)
+        return F.relu(out + x, inplace=True)
+
+
+class ResNet(nn.Module):
+    """Backbone of four Bottleneck stages (3, 4, 6 and 3 blocks)."""
+
+    def __init__(self):
+        super(ResNet, self).__init__()
+        self.inplanes = 64
+        self.conv1 = nn.Conv2d(
+            3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.layer1 = self.make_layer(64, 3, stride=1, dilation=1)
+        self.layer2 = self.make_layer(128, 4, stride=2, dilation=1)
+        self.layer3 = self.make_layer(256, 6, stride=2, dilation=1)
+        self.layer4 = self.make_layer(512, 3, stride=2, dilation=1)
+
+    def make_layer(self, planes, blocks, stride, dilation):
+        """Build one stage; only its first block strides and projects."""
+        width = planes * 4
+        shortcut = nn.Sequential(
+            nn.Conv2d(
+                self.inplanes, width, kernel_size=1, stride=stride,
+                bias=False), nn.BatchNorm2d(width))
+        stage = [
+            Bottleneck(
+                self.inplanes, planes, stride, shortcut, dilation=dilation)
+        ]
+        self.inplanes = width
+        stage += [Bottleneck(self.inplanes, planes, dilation=dilation)
+                  for _ in range(1, blocks)]
+        return nn.Sequential(*stage)
+
+    def forward(self, x):
+        x = x.reshape(1, 3, 448, 448)
+        stem = F.relu(self.bn1(self.conv1(x)), inplace=True)
+        stem = F.max_pool2d(stem, kernel_size=3, stride=2, padding=1)
+        out2 = self.layer1(stem)
+        out3 = self.layer2(out2)
+        out4 = self.layer3(out3)
+        out5 = self.layer4(out4)
+        return out2, out3, out4, out5
+
+
+class CFM(nn.Module):
+    """Fuse two 64-channel feature maps.
+
+    Each input passes through two conv-BN-ReLU layers, the two results are
+    multiplied element-wise, and each branch refines that product with two
+    more layers, adding back the output of its own first layer in between.
+    """
+
+    def __init__(self):
+        super(CFM, self).__init__()
+        # Registers conv1h/bn1h..conv4h/bn4h, then conv1v/bn1v..conv4v/bn4v.
+        for branch in ('h', 'v'):
+            for idx in range(1, 5):
+                setattr(self, 'conv{}{}'.format(idx, branch),
+                        nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1))
+                setattr(self, 'bn{}{}'.format(idx, branch), nn.BatchNorm2d(64))
+
+    def forward(self, left, down):
+        """Fuse left and down; down is resized to left's spatial size.
+
+        Returns the refined (left, down) feature pair.
+        """
+        target = left.size()[2:]
+        if down.size()[2:] != target:
+            down = F.interpolate(down, size=target, mode='bilinear')
+        left1 = F.relu(self.bn1h(self.conv1h(left)), inplace=True)
+        left2 = F.relu(self.bn2h(self.conv2h(left1)), inplace=True)
+        down1 = F.relu(self.bn1v(self.conv1v(down)), inplace=True)
+        down2 = F.relu(self.bn2v(self.conv2v(down1)), inplace=True)
+        fuse = left2 * down2
+        left3 = F.relu(self.bn3h(self.conv3h(fuse)), inplace=True) + left1
+        left4 = F.relu(self.bn4h(self.conv4h(left3)), inplace=True)
+        down3 = F.relu(self.bn3v(self.conv3v(fuse)), inplace=True) + down1
+        down4 = F.relu(self.bn4v(self.conv4v(down3)), inplace=True)
+        return left4, down4
+
+
+class Decoder(nn.Module):
+    """Decoder built from three CFM units.
+
+    Levels are fused in the order cfm45, cfm34, cfm23.
+    """
+
+    def __init__(self):
+        super(Decoder, self).__init__()
+        self.cfm45 = CFM()
+        self.cfm34 = CFM()
+        self.cfm23 = CFM()
+
+    def forward(self, out2h, out3h, out4h, out5v, fback=None):
+        """Fuse the four feature levels, first adding fback when given.
+
+        Returns (out2h, out3h, out4h, out5v, pred).
+        """
+        if fback is not None:
+            out5v, out4h, out3h, out2h = [
+                feat + F.interpolate(
+                    fback, size=feat.size()[2:], mode='bilinear')
+                for feat in (out5v, out4h, out3h, out2h)
+            ]
+        out4h, out4v = self.cfm45(out4h, out5v)
+        out3h, out3v = self.cfm34(out3h, out4v)
+        out2h, pred = self.cfm23(out2h, out3v)
+        return out2h, out3h, out4h, out5v, pred
+
+
+class F3Net(nn.Module):
+    """ResNet backbone, two chained Decoders and six 1-channel heads."""
+
+    def __init__(self):
+        super(F3Net, self).__init__()
+        self.bkbone = ResNet()
+        self.squeeze5 = nn.Sequential(
+            nn.Conv2d(2048, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
+        self.squeeze4 = nn.Sequential(
+            nn.Conv2d(1024, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
+        self.squeeze3 = nn.Sequential(
+            nn.Conv2d(512, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
+        self.squeeze2 = nn.Sequential(
+            nn.Conv2d(256, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
+
+        self.decoder1 = Decoder()
+        self.decoder2 = Decoder()
+        self.linearp1 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
+        self.linearp2 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
+
+        self.linearr2 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
+        self.linearr3 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
+        self.linearr4 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
+        self.linearr5 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x, shape=None):
+        """Run the network on one image.
+
+        x is reshaped to (1, 3, 448, 448). Every returned map is resized to
+        shape, or to the input's spatial size when shape is None. Returns
+        (pred1, pred2, out2h, out3h, out4h, out5h); no activation is applied.
+        """
+        x = x.reshape(1, 3, 448, 448)
+        out2h, out3h, out4h, out5v = self.bkbone(x)
+        out2h = self.squeeze2(out2h)
+        out3h = self.squeeze3(out3h)
+        out4h = self.squeeze4(out4h)
+        out5v = self.squeeze5(out5v)
+        feats = self.decoder1(out2h, out3h, out4h, out5v)
+        pred1 = feats[-1]
+        out2h, out3h, out4h, out5v, pred2 = self.decoder2(*feats)
+        if shape is None:
+            shape = x.size()[2:]
+        heads = (self.linearp1, self.linearp2, self.linearr2, self.linearr3,
+                 self.linearr4, self.linearr5)
+        maps = (pred1, pred2, out2h, out3h, out4h, out5v)
+        return tuple(
+            F.interpolate(head(fmap), size=shape, mode='bilinear')
+            for head, fmap in zip(heads, maps))
diff --git a/modelscope/models/cv/product_segmentation/seg_infer.py b/modelscope/models/cv/product_segmentation/seg_infer.py
new file mode 100644
index 00000000..876fac66
--- /dev/null
+++ b/modelscope/models/cv/product_segmentation/seg_infer.py
@@ -0,0 +1,77 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .net import F3Net
+
+logger = get_logger()
+
+
+def load_state_dict(model_dir, device):
+    """Load model_dir's checkpoint, stripping a 'module.' key prefix."""
+    _dict = torch.load(
+        '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+        map_location=device)
+    # k[7:] drops exactly len('module.') characters, so match the dot too.
+    return {
+        (k[7:] if k.startswith('module.') else k): v
+        for k, v in _dict.items()
+    }
+
+
+@MODELS.register_module(
+    Tasks.product_segmentation, module_name=Models.product_segmentation)
+class F3NetForProductSegmentation(TorchModel):
+    """F3Net loaded from model_dir and put in eval mode for inference."""
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        """Build F3Net on GPU if available (else CPU) and load weights."""
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+
+        self.model = F3Net()
+        if torch.cuda.is_available():
+            self.device = 'cuda'
+            logger.info('Use GPU')
+        else:
+            self.device = 'cpu'
+            logger.info('Use CPU')
+
+        self.params = load_state_dict(model_dir, self.device)
+        self.model.load_state_dict(self.params)
+        self.model.to(self.device)
+        self.model.eval()
+
+    def forward(self, x):
+        """Return the six-tuple of maps produced by F3Net.forward."""
+        return self.model(x)
+
+
+mean = np.array([[[124.55, 118.90, 102.94]]])
+std = np.array([[[56.77, 55.97, 57.50]]])
+
+
+def inference(model, device, input_path):
+    """Return an (H, W, 1) float mask for the image at input_path: sigmoid
+    scores scaled to 0..255 and rounded, with values below 20 set to 0.
+    """
+    img = np.array(Image.open(input_path).convert('RGB')).astype(np.float32)
+    img = cv2.resize((img - mean) / std, dsize=(448, 448),
+                     interpolation=cv2.INTER_LINEAR)
+    img = torch.from_numpy(img).permute(2, 0, 1).to(device).float()
+    # no_grad: Tensor.numpy() rejects tensors that require grad.
+    with torch.no_grad():
+        out = model(img)[0]
+    pred = (torch.sigmoid(out[0, 0]) * 255).cpu().numpy()
+    pred[pred < 20] = 0
+    pred = np.round(pred[:, :, np.newaxis])
+    logger.info('Inference Done')
+    return pred
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 9811811e..d8d2458a 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -674,5 +674,12 @@ TASK_OUTPUTS = {
     # {
     #   {'output': 'Happiness', 'boxes': (203, 104, 663, 564)}
     # }
-    Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES]
+    Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES],
+
+    # {
+    #     "masks": [
+    #           np.array # (H, W, 1) array: 0, or scores in 20..255
+    #       ]
+    # }
+    Tasks.product_segmentation: [OutputKeys.MASKS],
 }
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index ff56658f..7fa66b5f 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -187,6 +187,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     (Pipelines.face_human_hand_detection,
      'damo/cv_nanodet_face-human-hand-detection'),
     Tasks.face_emotion: (Pipelines.face_emotion, 'damo/cv_face-emotion'),
+    Tasks.product_segmentation: (Pipelines.product_segmentation,
+                                 'damo/cv_F3Net_product-segmentation'),
 }
 
 
diff --git a/modelscope/pipelines/cv/product_segmentation_pipeline.py b/modelscope/pipelines/cv/product_segmentation_pipeline.py
new file mode 100644
index 00000000..244b01d7
--- /dev/null
+++ b/modelscope/pipelines/cv/product_segmentation_pipeline.py
@@ -0,0 +1,40 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.product_segmentation import seg_infer
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.product_segmentation, module_name=Pipelines.product_segmentation)
+class F3NetForProductSegmentationPipeline(Pipeline):
+    """Pipeline that returns a product mask for an image path."""
+
+    def __init__(self, model: str, **kwargs):
+        """Create the product segmentation pipeline.
+
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        # input must carry the image location under 'input_path'.
+        image_path = input['input_path']
+        mask = seg_infer.inference(self.model, self.device, image_path)
+        return {OutputKeys.MASKS: mask}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 4aff1d05..7968fcd1 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -45,6 +45,7 @@ class CVTasks(object):
     hand_static = 'hand-static'
     face_human_hand_detection = 'face-human-hand-detection'
     face_emotion = 'face-emotion'
+    product_segmentation = 'product-segmentation'
 
     # image editing
     skin_retouching = 'skin-retouching'
diff --git a/tests/pipelines/test_product_segmentation.py b/tests/pipelines/test_product_segmentation.py
new file mode 100644
index 00000000..8f41c13c
--- /dev/null
+++ b/tests/pipelines/test_product_segmentation.py
@@ -0,0 +1,43 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+import cv2
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class ProductSegmentationTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_F3Net_product-segmentation'
+        self.input = {
+            'input_path': 'data/test/images/product_segmentation.jpg'
+        }
+
+    def pipeline_inference(self, pipeline: Pipeline, input: dict):
+        """Run the pipeline on input and write the mask to a JPEG file."""
+        result = pipeline(input)
+        cv2.imwrite('test_product_segmentation_mask.jpg',
+                    result[OutputKeys.MASKS])
+        logger.info('test done')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        seg = pipeline(Tasks.product_segmentation, model=self.model_id)
+        self.pipeline_inference(seg, self.input)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        seg = pipeline(Tasks.product_segmentation)
+        self.pipeline_inference(seg, self.input)
+
+
+if __name__ == '__main__':
+    unittest.main()